The following patch series integrates the Remus control layer into Xen. It provides a single user-visible script ("remus") to activate Remus on a guest virtual machine, and the libraries required by that script. Network buffering requires the Linux IMQ (http://linuximq.net) patch to be applied to dom0. I'll mail the upstream version that applies to the linux-2.6.18-xen.hg tree separately. _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Brendan Cully
2009-Nov-13 01:10 UTC
[Xen-devel] [PATCH 1 of 3] Remus: add python control extensions
# HG changeset patch # User Brendan Cully <brendan@cs.ubc.ca> # Date 1258073720 28800 # Node ID 213fb814acf431d2a382e8f9c09b4cea106c0958 # Parent accded2f185f4178f875b170a5c01544648a68d2 Remus: add python control extensions Signed-off-by: Brendan Cully <brendan@cs.ubc.ca> diff --git a/tools/python/setup.py b/tools/python/setup.py --- a/tools/python/setup.py +++ b/tools/python/setup.py @@ -67,10 +67,28 @@ libraries = libraries, sources = [ "ptsname/ptsname.c" ]) +checkpoint = Extension("checkpoint", + extra_compile_args = extra_compile_args, + include_dirs = include_dirs, + library_dirs = library_dirs, + libraries = libraries + [ "rt" ], + sources = [ "xen/lowlevel/checkpoint/checkpoint.c", + "xen/lowlevel/checkpoint/libcheckpoint.c"]) + +netlink = Extension("netlink", + extra_compile_args = extra_compile_args, + include_dirs = include_dirs, + library_dirs = library_dirs, + libraries = libraries, + sources = [ "xen/lowlevel/netlink/netlink.c", + "xen/lowlevel/netlink/libnetlink.c"]) + modules = [ xc, xs, ptsname, acm, flask ] -if os.uname()[0] == ''SunOS'': - modules.append(scf) - modules.append(process) +plat = os.uname()[0] +if plat == ''SunOS'': + modules.extend([ scf, process ]) +if plat == ''Linux'': + modules.extend([ checkpoint, netlink ]) setup(name = ''xen'', version = ''3.0'', @@ -89,6 +107,7 @@ ''xen.web'', ''xen.sv'', ''xen.xsview'', + ''xen.remus'', ''xen.xend.tests'', ''xen.xend.server.tests'', diff --git a/tools/python/xen/lowlevel/checkpoint/checkpoint.c b/tools/python/xen/lowlevel/checkpoint/checkpoint.c new file mode 100644 --- /dev/null +++ b/tools/python/xen/lowlevel/checkpoint/checkpoint.c @@ -0,0 +1,363 @@ +/* python bridge to checkpointing API */ + +#include <Python.h> + +#include <xs.h> +#include <xenctrl.h> + +#include "checkpoint.h" + +#define PKG "xen.lowlevel.checkpoint" + +static PyObject* CheckpointError; + +typedef struct { + PyObject_HEAD + checkpoint_state cps; + + /* milliseconds between checkpoints */ + unsigned int interval; + int 
armed; + + PyObject* suspend_cb; + PyObject* postcopy_cb; + PyObject* checkpoint_cb; + + PyThreadState* threadstate; +} CheckpointObject; + +static int suspend_trampoline(void* data); +static int postcopy_trampoline(void* data); +static int checkpoint_trampoline(void* data); + +static PyObject* Checkpoint_new(PyTypeObject* type, PyObject* args, + PyObject* kwargs) +{ + CheckpointObject* self = (CheckpointObject*)type->tp_alloc(type, 0); + + if (!self) + return NULL; + + checkpoint_init(&self->cps); + self->suspend_cb = NULL; + self->armed = 0; + + return (PyObject*)self; +} + +static int Checkpoint_init(PyObject* obj, PyObject* args, PyObject* kwargs) +{ + return 0; +} + +static void Checkpoint_dealloc(CheckpointObject* self) +{ + checkpoint_close(&self->cps); + + self->ob_type->tp_free((PyObject*)self); +} + +static PyObject* pycheckpoint_open(PyObject* obj, PyObject* args) +{ + CheckpointObject* self = (CheckpointObject*)obj; + checkpoint_state* cps = &self->cps; + unsigned int domid; + + if (!PyArg_ParseTuple(args, "I", &domid)) + return NULL; + + if (checkpoint_open(cps, domid) < 0) { + PyErr_SetString(CheckpointError, checkpoint_error(cps)); + + return NULL; + } + + Py_RETURN_NONE; +} + +static PyObject* pycheckpoint_close(PyObject* obj, PyObject* args) +{ + CheckpointObject* self = (CheckpointObject*)obj; + + checkpoint_close(&self->cps); + + Py_XDECREF(self->suspend_cb); + self->suspend_cb = NULL; + Py_XDECREF(self->postcopy_cb); + self->postcopy_cb = NULL; + Py_XDECREF(self->checkpoint_cb); + self->checkpoint_cb = NULL; + + Py_RETURN_NONE; +} + +static PyObject* pycheckpoint_start(PyObject* obj, PyObject* args) { + CheckpointObject* self = (CheckpointObject*)obj; + + PyObject* iofile; + PyObject* suspend_cb = NULL; + PyObject* postcopy_cb = NULL; + PyObject* checkpoint_cb = NULL; + unsigned int interval = 0; + + int fd; + struct save_callbacks callbacks; + int rc; + + if (!PyArg_ParseTuple(args, "O|OOOI", &iofile, &suspend_cb, &postcopy_cb, + 
&checkpoint_cb, &interval)) + return NULL; + + self->interval = interval; + + Py_INCREF(iofile); + Py_XINCREF(suspend_cb); + Py_XINCREF(postcopy_cb); + Py_XINCREF(checkpoint_cb); + + fd = PyObject_AsFileDescriptor(iofile); + Py_DECREF(iofile); + if (fd < 0) { + PyErr_SetString(PyExc_TypeError, "invalid file handle"); + return NULL; + } + + if (suspend_cb && suspend_cb != Py_None) { + if (!PyCallable_Check(suspend_cb)) { + PyErr_SetString(PyExc_TypeError, "suspend callback not callable"); + goto err; + } + self->suspend_cb = suspend_cb; + } else + self->suspend_cb = NULL; + + if (postcopy_cb && postcopy_cb != Py_None) { + if (!PyCallable_Check(postcopy_cb)) { + PyErr_SetString(PyExc_TypeError, "postcopy callback not callable"); + return NULL; + } + self->postcopy_cb = postcopy_cb; + } else + self->postcopy_cb = NULL; + + if (checkpoint_cb && checkpoint_cb != Py_None) { + if (!PyCallable_Check(checkpoint_cb)) { + PyErr_SetString(PyExc_TypeError, "checkpoint callback not callable"); + return NULL; + } + self->checkpoint_cb = checkpoint_cb; + } else + self->checkpoint_cb = NULL; + + callbacks.suspend = suspend_trampoline; + callbacks.postcopy = postcopy_trampoline; + callbacks.checkpoint = checkpoint_trampoline; + callbacks.data = self; + + self->threadstate = PyEval_SaveThread(); + rc = checkpoint_start(&self->cps, fd, &callbacks); + PyEval_RestoreThread(self->threadstate); + + if (rc < 0) { + PyErr_SetString(CheckpointError, checkpoint_error(&self->cps)); + goto err; + } + + Py_RETURN_NONE; + + err: + self->suspend_cb = NULL; + Py_XDECREF(suspend_cb); + self->postcopy_cb = NULL; + Py_XDECREF(postcopy_cb); + self->checkpoint_cb = NULL; + Py_XDECREF(checkpoint_cb); + + return NULL; +} + +static PyMethodDef Checkpoint_methods[] = { + { "open", pycheckpoint_open, METH_VARARGS, + "open connection to xen" }, + { "close", pycheckpoint_close, METH_NOARGS, + "close connection to xen" }, + { "start", pycheckpoint_start, METH_VARARGS | METH_KEYWORDS, + "begin a checkpoint" }, + 
{ NULL, NULL, 0, NULL } +}; + +static PyTypeObject CheckpointType = { + PyObject_HEAD_INIT(NULL) + 0, /* ob_size */ + PKG ".checkpointer", /* tp_name */ + sizeof(CheckpointObject), /* tp_basicsize */ + 0, /* tp_itemsize */ + (destructor)Checkpoint_dealloc, /* tp_dealloc */ + NULL, /* tp_print */ + NULL, /* tp_getattr */ + NULL, /* tp_setattr */ + NULL, /* tp_compare */ + NULL, /* tp_repr */ + NULL, /* tp_as_number */ + NULL, /* tp_as_sequence */ + NULL, /* tp_as_mapping */ + NULL, /* tp_hash */ + NULL, /* tp_call */ + NULL, /* tp_str */ + NULL, /* tp_getattro */ + NULL, /* tp_setattro */ + NULL, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT, /* tp_flags */ + "Checkpoint object", /* tp_doc */ + NULL, /* tp_traverse */ + NULL, /* tp_clear */ + NULL, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + NULL, /* tp_iter */ + NULL, /* tp_iternext */ + Checkpoint_methods, /* tp_methods */ + NULL, /* tp_members */ + NULL, /* tp_getset */ + NULL, /* tp_base */ + NULL, /* tp_dict */ + NULL, /* tp_descr_get */ + NULL, /* tp_descr_set */ + 0, /* tp_dictoffset */ + (initproc)Checkpoint_init, /* tp_init */ + NULL, /* tp_alloc */ + Checkpoint_new, /* tp_new */ +}; + +static PyMethodDef methods[] = { + { NULL } +}; + +static char doc[] = "checkpoint API"; + +PyMODINIT_FUNC initcheckpoint(void) { + PyObject *m; + + if (PyType_Ready(&CheckpointType) < 0) + return; + + m = Py_InitModule3(PKG, methods, doc); + + if (!m) + return; + + Py_INCREF(&CheckpointType); + PyModule_AddObject(m, "checkpointer", (PyObject*)&CheckpointType); + + CheckpointError = PyErr_NewException(PKG ".error", NULL, NULL); + Py_INCREF(CheckpointError); + PyModule_AddObject(m, "error", CheckpointError); + + block_timer(); +} + +/* private functions */ + +/* bounce C suspend call into python equivalent. 
+ * returns 1 on success or 0 on failure */ +static int suspend_trampoline(void* data) +{ + CheckpointObject* self = (CheckpointObject*)data; + + PyObject* result; + + /* call default suspend function, then python hook if available */ + if (self->armed) { + if (checkpoint_wait(&self->cps) < 0) { + fprintf(stderr, "%s\n", checkpoint_error(&self->cps)); + return 0; + } + } else { + if (self->interval) { + self->armed = 1; + checkpoint_settimer(&self->cps, self->interval); + } + + if (!checkpoint_suspend(&self->cps)) { + fprintf(stderr, "%s\n", checkpoint_error(&self->cps)); + return 0; + } + } + + if (!self->suspend_cb) + return 1; + + PyEval_RestoreThread(self->threadstate); + result = PyObject_CallFunction(self->suspend_cb, NULL); + self->threadstate = PyEval_SaveThread(); + + if (!result) + return 0; + + if (result == Py_None || PyObject_IsTrue(result)) { + Py_DECREF(result); + return 1; + } + + Py_DECREF(result); + + return 0; +} + +static int postcopy_trampoline(void* data) +{ + CheckpointObject* self = (CheckpointObject*)data; + + PyObject* result; + int rc = 0; + + if (!self->postcopy_cb) + goto resume; + + PyEval_RestoreThread(self->threadstate); + result = PyObject_CallFunction(self->postcopy_cb, NULL); + + if (result && (result == Py_None || PyObject_IsTrue(result))) + rc = 1; + + Py_XDECREF(result); + self->threadstate = PyEval_SaveThread(); + + resume: + if (checkpoint_resume(&self->cps) < 0) { + fprintf(stderr, "%s\n", checkpoint_error(&self->cps)); + return 0; + } + + return rc; +} + +static int checkpoint_trampoline(void* data) +{ + CheckpointObject* self = (CheckpointObject*)data; + + PyObject* result; + + if (checkpoint_postflush(&self->cps) < 0) { + fprintf(stderr, "%s\n", checkpoint_error(&self->cps)); + return -1; + } + + if (!self->checkpoint_cb) + return 0; + + PyEval_RestoreThread(self->threadstate); + result = PyObject_CallFunction(self->checkpoint_cb, NULL); + self->threadstate = PyEval_SaveThread(); + + if (!result) + return 0; + + if 
(result == Py_None || PyObject_IsTrue(result)) { + Py_DECREF(result); + return 1; + } + + Py_DECREF(result); + + return 0; +} diff --git a/tools/python/xen/lowlevel/checkpoint/checkpoint.h b/tools/python/xen/lowlevel/checkpoint/checkpoint.h new file mode 100644 --- /dev/null +++ b/tools/python/xen/lowlevel/checkpoint/checkpoint.h @@ -0,0 +1,59 @@ +/* API for checkpointing */ + +#ifndef _CHECKPOINT_H_ +#define _CHECKPOINT_H_ 1 + +#include <pthread.h> +#include <semaphore.h> +#include <time.h> + +#include <xenguest.h> +#include <xs.h> + +typedef enum { + dt_unknown, + dt_pv, + dt_hvm, + dt_pvhvm /* HVM with PV drivers */ +} checkpoint_domtype; + +typedef struct { + int xch; /* xc handle */ + int xce; /* event channel handle */ + struct xs_handle* xsh; /* xenstore handle */ + int watching_shutdown; /* state of watch on @releaseDomain */ + + unsigned int domid; + checkpoint_domtype domtype; + int fd; + + int suspend_evtchn; + + char* errstr; + + /* suspend deadline thread support */ + volatile int suspended; + volatile int done; + pthread_t suspend_thr; + sem_t suspended_sem; + sem_t resumed_sem; + timer_t timer; +} checkpoint_state; + +char* checkpoint_error(checkpoint_state* s); + +void checkpoint_init(checkpoint_state* s); +int checkpoint_open(checkpoint_state* s, unsigned int domid); +void checkpoint_close(checkpoint_state* s); +int checkpoint_start(checkpoint_state* s, int fd, + struct save_callbacks* callbacks); +int checkpoint_suspend(checkpoint_state* s); +int checkpoint_resume(checkpoint_state* s); +int checkpoint_postflush(checkpoint_state* s); + +int checkpoint_settimer(checkpoint_state* s, int millis); +int checkpoint_wait(checkpoint_state* s); +void block_timer(void); +void unblock_timer(void); + +#endif diff --git a/tools/python/xen/lowlevel/checkpoint/libcheckpoint.c b/tools/python/xen/lowlevel/checkpoint/libcheckpoint.c new file mode 100644 --- /dev/null +++ b/tools/python/xen/lowlevel/checkpoint/libcheckpoint.c @@ -0,0 +1,782 @@ +/* API for 
checkpointing */ + +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <time.h> +#include <signal.h> +#include <sys/stat.h> + +#include <xenctrl.h> +#include <xenguest.h> +#include <xs.h> + +#include "checkpoint.h" + +static char errbuf[256]; + +static int setup_suspend_evtchn(checkpoint_state* s); +static void release_suspend_evtchn(checkpoint_state *s); +static int setup_shutdown_watch(checkpoint_state* s); +static int check_shutdown_watch(checkpoint_state* s); +static void release_shutdown_watch(checkpoint_state* s); +static int poll_evtchn(checkpoint_state* s); + +static int switch_qemu_logdirty(checkpoint_state* s, int enable); +static int suspend_hvm(checkpoint_state* s); +static int suspend_qemu(checkpoint_state* s); +static int resume_qemu(checkpoint_state* s); +static int send_qemu(checkpoint_state* s); + +static int create_suspend_timer(checkpoint_state* s); +static int delete_suspend_timer(checkpoint_state* s); +static int create_suspend_thread(checkpoint_state* s); +static void stop_suspend_thread(checkpoint_state* s); + +/* Returns a string describing the most recent error returned by + * a checkpoint function. Static -- do not free. 
*/ +char* checkpoint_error(checkpoint_state* s) +{ + return s->errstr; +} + +void checkpoint_init(checkpoint_state* s) +{ + s->xch = -1; + s->xce = -1; + s->xsh = NULL; + s->watching_shutdown = 0; + + s->domid = 0; + s->domtype = dt_unknown; + s->fd = -1; + + s->suspend_evtchn = -1; + + s->errstr = NULL; + + s->suspended = 0; + s->done = 0; + s->suspend_thr = 0; + s->timer = 0; +} + +/* open a checkpoint session to guest domid */ +int checkpoint_open(checkpoint_state* s, unsigned int domid) +{ + xc_dominfo_t dominfo; + unsigned long pvirq; + + s->domid = domid; + + s->xch = xc_interface_open(); + if (s->xch < 0) { + s->errstr = "could not open control interface (are you root?)"; + + return -1; + } + + s->xsh = xs_daemon_open(); + if (!s->xsh) { + checkpoint_close(s); + s->errstr = "could not open xenstore handle"; + + return -1; + } + + s->xce = xc_evtchn_open(); + if (s->xce < 0) { + checkpoint_close(s); + s->errstr = "could not open event channel handle"; + + return -1; + } + + if (xc_domain_getinfo(s->xch, s->domid, 1, &dominfo) < 0) { + checkpoint_close(s); + s->errstr = "could not get domain info"; + + return -1; + } + if (dominfo.hvm) { + if (xc_get_hvm_param(s->xch, s->domid, HVM_PARAM_CALLBACK_IRQ, &pvirq)) { + checkpoint_close(s); + s->errstr = "could not get HVM callback IRQ"; + + return -1; + } + s->domtype = pvirq ? 
dt_pvhvm : dt_hvm; + } else + s->domtype = dt_pv; + + if (setup_shutdown_watch(s) < 0) { + checkpoint_close(s); + + return -1; + } + + if (s->domtype == dt_pv) { + if (setup_suspend_evtchn(s) < 0) { + checkpoint_close(s); + + return -1; + } + } else if (s->domtype == dt_pvhvm) { + checkpoint_close(s); + s->errstr = "PV-on-HVM is unsupported"; + + return -1; + } + + return 0; +} + +void checkpoint_close(checkpoint_state* s) +{ + if (s->timer) + delete_suspend_timer(s); + if (s->suspend_thr) + stop_suspend_thread(s); + + release_shutdown_watch(s); + release_suspend_evtchn(s); + + if (s->xch >= 0) { + xc_interface_close(s->xch); + s->xch = -1; + } + if (s->xce >= 0) { + xc_evtchn_close(s->xce); + s->xce = -1; + } + if (s->xsh) { + xs_daemon_close(s->xsh); + s->xsh = NULL; + } + + s->domid = 0; + s->fd = -1; + s->suspend_evtchn = -1; +} + +/* we toggle logdirty ourselves around the xc_domain_save call -- + * it avoids having to pass around checkpoint_state */ +static void noop_switch_logdirty(int domid, unsigned enable) +{ + return; +} + +int checkpoint_start(checkpoint_state* s, int fd, + struct save_callbacks* callbacks) +{ + int hvm, rc; + int flags = XCFLAGS_LIVE; + + if (!s->domid) { + s->errstr = "checkpoint state not opened"; + return -1; + } + + s->fd = fd; + + hvm = s->domtype > dt_pv; + if (hvm) { + flags |= XCFLAGS_HVM; + if ((rc = switch_qemu_logdirty(s, 1))) + return rc; + } + + rc = xc_domain_save(s->xch, fd, s->domid, 0, 0, flags, callbacks, hvm, + noop_switch_logdirty); + + if (hvm) + switch_qemu_logdirty(s, 0); + + return rc; +} + +/* suspend the domain. Returns 0 on failure, 1 on success */ +int checkpoint_suspend(checkpoint_state* s) +{ + struct timeval tv; + int rc; + + gettimeofday(&tv, NULL); + fprintf(stderr, "PROF: suspending at %lu.%06lu\n", (unsigned long)tv.tv_sec, + (unsigned long)tv.tv_usec); + + if (s->domtype == dt_hvm) { + return suspend_hvm(s) < 0 ? 
0 : 1; + } + + rc = xc_evtchn_notify(s->xce, s->suspend_evtchn); + if (rc < 0) { + snprintf(errbuf, sizeof(errbuf), + "failed to notify suspend event channel: %d", rc); + s->errstr = errbuf; + + return 0; + } + + do { + rc = poll_evtchn(s); + } while (rc >= 0 && rc != s->suspend_evtchn); + if (rc <= 0) { + snprintf(errbuf, sizeof(errbuf), + "failed to receive suspend notification: %d", rc); + s->errstr = errbuf; + + return 0; + } + if (xc_evtchn_unmask(s->xce, s->suspend_evtchn) < 0) { + snprintf(errbuf, sizeof(errbuf), + "failed to unmask suspend notification channel: %d", rc); + s->errstr = errbuf; + + return 0; + } + + return 1; +} + +/* wait for a suspend to be triggered by another thread */ +int checkpoint_wait(checkpoint_state* s) +{ + int rc; + + if (!s->suspend_thr) { + s->errstr = "checkpoint timer is not active\n"; + return -1; + } + + do { + rc = sem_wait(&s->suspended_sem); + if (rc < 0 && errno != EINTR) { + snprintf(errbuf, sizeof(errbuf), + "error waiting for suspend semaphore: %d %d\n", rc, errno); + s->errstr = errbuf; + return -1; + } + } while (rc < 0); + + if (!s->suspended) { + snprintf(errbuf, sizeof(errbuf), "domain not suspended?\n"); + s->errstr = errbuf; + return -1; + } + + return 0; +} + +/* let guest execution resume */ +int checkpoint_resume(checkpoint_state* s) +{ + struct timeval tv; + int rc; + + if (xc_domain_resume(s->xch, s->domid, 1)) { + snprintf(errbuf, sizeof(errbuf), "error resuming domain: %d", errno); + s->errstr = errbuf; + + return -1; + } + + gettimeofday(&tv, NULL); + fprintf(stderr, "PROF: resumed at %lu.%06lu\n", (unsigned long)tv.tv_sec, + (unsigned long)tv.tv_usec); + + if (s->domtype > dt_pv && resume_qemu(s) < 0) + return -1; + + /* restore watchability in xenstore */ + if (xs_resume_domain(s->xsh, s->domid) < 0) + fprintf(stderr, "error resuming domain in xenstore\n"); + + s->suspended = 0; + + if (s->suspend_thr) { + if ((rc = sem_post(&s->resumed_sem))) + fprintf(stderr, "error posting resume semaphore\n"); + 
} + + return 0; +} + +/* called after xc_domain_save has flushed its buffer */ +int checkpoint_postflush(checkpoint_state *s) +{ + if (s->domtype > dt_pv && send_qemu(s) < 0) + return -1; + + return 0; +} + +/* force suspend within millis ms if copy hasn''t completed yet */ +int checkpoint_settimer(checkpoint_state* s, int millis) +{ + struct itimerspec t; + int err; + + if (!s->suspend_thr) { + if (create_suspend_timer(s) < 0) + return -1; + + if (create_suspend_thread(s) < 0) { + delete_suspend_timer(s); + return -1; + } + } + + t.it_value.tv_sec = millis / 1000; + t.it_value.tv_nsec = (millis % 1000) * 1000000L; + t.it_interval.tv_sec = t.it_value.tv_sec; + t.it_interval.tv_nsec = t.it_value.tv_nsec; + + if ((err = timer_settime(s->timer, 0, &t, NULL))) { + fprintf(stderr, "Error arming timer: %d\n", err); + return -1; + } + + return 0; +} + +int delete_suspend_timer(checkpoint_state* s) +{ + int rc = 0; + + if (s->timer) { + if ((rc = timer_delete(s->timer))) + fprintf(stderr, "Error deleting timer: %s\n", strerror(errno)); + s->timer = NULL; + } + + return rc; +} + +/* Set up event channel used to signal a guest to suspend itself */ +static int setup_suspend_evtchn(checkpoint_state* s) +{ + int port; + + port = xs_suspend_evtchn_port(s->domid); + if (port < 0) { + s->errstr = "failed to read suspend event channel"; + return -1; + } + + s->suspend_evtchn = xc_suspend_evtchn_init(s->xch, s->xce, s->domid, port); + if (s->suspend_evtchn < 0) { + snprintf(errbuf, sizeof(errbuf), "failed to bind suspend event channel"); + s->errstr = errbuf; + + return -1; + } + + fprintf(stderr, "bound to suspend event channel %u:%d as %d\n", s->domid, port, + s->suspend_evtchn); + + return 0; +} + +/* release suspend event channels bound to guest */ +static void release_suspend_evtchn(checkpoint_state *s) +{ + /* TODO: teach xen to clean up if port is unbound */ + if (s->xce >= 0 && s->suspend_evtchn > 0) { + xc_suspend_evtchn_release(s->xce, s->suspend_evtchn); + 
s->suspend_evtchn = 0; + } +} + +static int setup_shutdown_watch(checkpoint_state* s) +{ + char buf[16]; + + /* write domain ID to watch so we can ignore other domain shutdowns */ + snprintf(buf, sizeof(buf), "%u", s->domid); + if ( !xs_watch(s->xsh, "@releaseDomain", buf) ) { + fprintf(stderr, "Could not bind to shutdown watch\n"); + return -1; + } + /* watch fires once on registration */ + s->watching_shutdown = 1; + check_shutdown_watch(s); + + return 0; +} + +static int check_shutdown_watch(checkpoint_state* s) { + unsigned int count; + char **vec; + char buf[16]; + + vec = xs_read_watch(s->xsh, &count); + if (s->watching_shutdown == 1) { + s->watching_shutdown = 2; + return 0; + } + if (!vec) { + fprintf(stderr, "empty watch fired\n"); + return 0; + } + snprintf(buf, sizeof(buf), "%d", s->domid); + if (!strcmp(vec[XS_WATCH_TOKEN], buf)) { + fprintf(stderr, "domain %d shut down\n", s->domid); + return -1; + } + + return 0; +} + +static void release_shutdown_watch(checkpoint_state* s) { + char buf[16]; + + if (!s->xsh) + return; + + if (!s->watching_shutdown) + return; + + snprintf(buf, sizeof(buf), "%u", s->domid); + if (!xs_unwatch(s->xsh, "@releaseDomain", buf)) + fprintf(stderr, "Could not release shutdown watch\n"); +} + +/* wrapper around xc_evtchn_pending which detects errors */ +static int poll_evtchn(checkpoint_state* s) +{ + int fd, xsfd, maxfd; + fd_set rfds, efds; + struct timeval tv; + int rc; + + fd = xc_evtchn_fd(s->xce); + xsfd = xs_fileno(s->xsh); + maxfd = fd > xsfd ? 
fd : xsfd; + FD_ZERO(&rfds); + FD_ZERO(&efds); + FD_SET(fd, &rfds); + FD_SET(xsfd, &rfds); + FD_SET(fd, &efds); + FD_SET(xsfd, &efds); + + /* give it 500 ms to respond */ + tv.tv_sec = 0; + tv.tv_usec = 500000; + + rc = select(maxfd + 1, &rfds, NULL, &efds, &tv); + if (rc < 0) + fprintf(stderr, "error polling event channel: %s\n", strerror(errno)); + else if (!rc) + fprintf(stderr, "timeout waiting for event channel\n"); + else if (FD_ISSET(fd, &rfds)) + return xc_evtchn_pending(s->xce); + else if (FD_ISSET(xsfd, &rfds)) + return check_shutdown_watch(s); + + return -1; +} + +/* adapted from the eponymous function in xc_save */ +static int switch_qemu_logdirty(checkpoint_state *s, int enable) +{ + char path[128]; + char *tail, *cmd, *response; + char **vec; + unsigned int len; + + sprintf(path, "/local/domain/0/device-model/%u/logdirty/", s->domid); + tail = path + strlen(path); + + strcpy(tail, "ret"); + if (!xs_watch(s->xsh, path, "qemu-logdirty-ret")) { + s->errstr = "error watching qemu logdirty return"; + return -1; + } + /* null fire. XXX unify with shutdown watch! */ + vec = xs_read_watch(s->xsh, &len); + free(vec); + + strcpy(tail, "cmd"); + cmd = enable ? 
"enable" : "disable"; + if (!xs_write(s->xsh, XBT_NULL, path, cmd, strlen(cmd))) { + s->errstr = "error signalling qemu logdirty"; + return -1; + } + + vec = xs_read_watch(s->xsh, &len); + free(vec); + + strcpy(tail, "ret"); + xs_unwatch(s->xsh, path, "qemu-logdirty-ret"); + + response = xs_read(s->xsh, XBT_NULL, path, &len); + if (!len || strcmp(response, cmd)) { + if (len) + free(response); + s->errstr = "qemu logdirty command failed"; + return -1; + } + free(response); + fprintf(stderr, "qemu logdirty mode: %s\n", cmd); + + return 0; +} + +static int suspend_hvm(checkpoint_state *s) +{ + int rc = -1; + + fprintf(stderr, "issuing HVM suspend hypercall\n"); + rc = xc_domain_shutdown(s->xch, s->domid, SHUTDOWN_suspend); + if (rc < 0) { + s->errstr = "shutdown hypercall failed"; + return -1; + } + fprintf(stderr, "suspend hypercall returned %d\n", rc); + + if (check_shutdown_watch(s) >= 0) + return -1; + + rc = suspend_qemu(s); + + return rc; +} + +static int suspend_qemu(checkpoint_state *s) +{ + char path[128]; + + fprintf(stderr, "pausing QEMU\n"); + + sprintf(path, "/local/domain/0/device-model/%d/command", s->domid); + if (!xs_write(s->xsh, XBT_NULL, path, "save", 4)) { + fprintf(stderr, "error signalling QEMU to save\n"); + return -1; + } + + sprintf(path, "/local/domain/0/device-model/%d/state", s->domid); + + do { + char* state; + unsigned int len; + + state = xs_read(s->xsh, XBT_NULL, path, &len); + if (!state) { + s->errstr = "error reading QEMU state"; + return -1; + } + + if (!strcmp(state, "paused")) { + free(state); + return 0; + } + + free(state); + usleep(1000); + } while(1); + + return -1; +} + +static int resume_qemu(checkpoint_state *s) +{ + char path[128]; + fprintf(stderr, "resuming QEMU\n"); + + sprintf(path, "/local/domain/0/device-model/%d/command", s->domid); + if (!xs_write(s->xsh, XBT_NULL, path, "continue", 8)) { + fprintf(stderr, "error signalling QEMU to resume\n"); + return -1; + } + + return 0; +} + +static int 
send_qemu(checkpoint_state *s) +{ + char buf[8192]; + char path[128]; + struct stat sb; + uint32_t qlen = 0; + int qfd; + int rc; + + if (s->fd < 0) + return -1; + + sprintf(path, "/var/lib/xen/qemu-save.%d", s->domid); + + if (stat(path, &sb) < 0) { + snprintf(errbuf, sizeof(errbuf), + "error getting QEMU state file status: %s", strerror(errno)); + s->errstr = errbuf; + return -1; + } + + qlen = sb.st_size; + qfd = open(path, O_RDONLY); + if (qfd < 0) { + snprintf(errbuf, sizeof(errbuf), "error opening QEMU state file: %s", + strerror(errno)); + s->errstr = errbuf; + return -1; + } + + fprintf(stderr, "Sending %u bytes of QEMU state\n", qlen); + if (write(s->fd, "RemusDeviceModelState", 21) != 21) { + s->errstr = "error writing QEMU header"; + close(qfd); + return -1; + } + if (write(s->fd, &qlen, sizeof(qlen)) != sizeof(qlen)) { + s->errstr = "error writing QEMU size"; + close(qfd); + return -1; + } + + while ((rc = read(qfd, buf, qlen > sizeof(buf) ? sizeof(buf) : qlen)) > 0) { + qlen -= rc; + if (write(s->fd, buf, rc) != rc) { + rc = -1; + break; + } + } + if (rc < 0) { + snprintf(errbuf, sizeof(errbuf), "error writing QEMU state: %s", + strerror(errno)); + s->errstr = errbuf; + } + + close(qfd); + + return rc; +} + +/*thread responsible to suspend the domain early if necessary*/ +static void *suspend_thread(void *arg) +{ + checkpoint_state* s = (checkpoint_state*)arg; + sigset_t tss; + int rc; + int sig; + + fprintf(stderr, "Suspend thread started\n"); + + sigemptyset(&tss); + sigaddset(&tss, SIGRTMIN); + + while (1) { + /* wait for checkpoint thread to signal resume */ + if ((rc = sem_wait(&s->resumed_sem))) + fprintf(stderr, "Error waiting on resume semaphore\n"); + + if ((rc = sigwait(&tss, &sig))) { + fprintf(stderr, "sigwait failed: %d %d\n", rc, errno); + break; + } + if (sig != SIGRTMIN) + fprintf(stderr, "received unexpected signal %d\n", sig); + + if (s->done) + break; + + if (s->suspended) { + fprintf(stderr, "domain already suspended?\n"); + } else 
{ + rc = checkpoint_suspend(s); + if (rc) + s->suspended = 1; + else + fprintf(stderr, "checkpoint_suspend failed\n"); + } + + if ((rc = sem_post(&s->suspended_sem))) + fprintf(stderr, "Error posting suspend semaphore\n"); + } + + fprintf(stderr, "Suspend thread exiting\n"); + + return NULL; +} + +static int create_suspend_timer(checkpoint_state* s) +{ + struct sigevent event; + int err; + + event.sigev_notify = SIGEV_SIGNAL; + event.sigev_signo = SIGRTMIN; + event.sigev_value.sival_int = 0; + + if ((err = timer_create(CLOCK_REALTIME, &event, &s->timer))) { + snprintf(errbuf, sizeof(errbuf), "Error creating timer: %d\n", err); + s->errstr = errbuf; + return -1; + } + + return 0; +} + +void block_timer(void) +{ + sigset_t tss; + + sigemptyset(&tss); + sigaddset(&tss, SIGRTMIN); + + pthread_sigmask(SIG_BLOCK, &tss, NULL); +} + +void unblock_timer(void) +{ + sigset_t tss; + + sigemptyset(&tss); + sigaddset(&tss, SIGRTMIN); + + pthread_sigmask(SIG_UNBLOCK, &tss, NULL); +} + +static int create_suspend_thread(checkpoint_state* s) +{ + int err; + + if ((err = sem_init(&s->suspended_sem, 0, 0))) { + snprintf(errbuf, sizeof(errbuf), + "Error initializing suspend semaphore: %d\n", err); + s->errstr = errbuf; + return -1; + } + + if ((err = sem_init(&s->resumed_sem, 0, 0))) { + snprintf(errbuf, sizeof(errbuf), + "Error initializing resume semaphore: %d\n", err); + s->errstr = errbuf; + return -1; + } + + /* signal mask should be inherited */ + block_timer(); + + if ((err = pthread_create(&s->suspend_thr, NULL, suspend_thread, s))) { + snprintf(errbuf, sizeof(errbuf), "Error creating suspend thread: %d\n", err); + s->errstr = errbuf; + return -1; + } + + return 0; +} + +static void stop_suspend_thread(checkpoint_state* s) +{ + int err; + + s->done = 1; + + err = sem_post(&s->resumed_sem); + + err = pthread_join(s->suspend_thr, NULL); + s->suspend_thr = 0; +} diff --git a/tools/python/xen/lowlevel/netlink/libnetlink.c b/tools/python/xen/lowlevel/netlink/libnetlink.c new file 
mode 100644 --- /dev/null +++ b/tools/python/xen/lowlevel/netlink/libnetlink.c @@ -0,0 +1,585 @@ +/* + * libnetlink.c RTnetlink service routines. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <syslog.h> +#include <fcntl.h> +#include <net/if_arp.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <string.h> +#include <errno.h> +#include <time.h> +#include <sys/uio.h> + +#include "libnetlink.h" + +void rtnl_close(struct rtnl_handle *rth) +{ + close(rth->fd); +} + +int rtnl_open_byproto(struct rtnl_handle *rth, unsigned subscriptions, + int protocol) +{ + socklen_t addr_len; + int sndbuf = 32768; + int rcvbuf = 32768; + + memset(rth, 0, sizeof(rth)); + + rth->fd = socket(AF_NETLINK, SOCK_RAW, protocol); + if (rth->fd < 0) { + perror("Cannot open netlink socket"); + return -1; + } + + if (setsockopt(rth->fd,SOL_SOCKET,SO_SNDBUF,&sndbuf,sizeof(sndbuf)) < 0) { + perror("SO_SNDBUF"); + return -1; + } + + if (setsockopt(rth->fd,SOL_SOCKET,SO_RCVBUF,&rcvbuf,sizeof(rcvbuf)) < 0) { + perror("SO_RCVBUF"); + return -1; + } + + memset(&rth->local, 0, sizeof(rth->local)); + rth->local.nl_family = AF_NETLINK; + rth->local.nl_groups = subscriptions; + + if (bind(rth->fd, (struct sockaddr*)&rth->local, sizeof(rth->local)) < 0) { + perror("Cannot bind netlink socket"); + return -1; + } + addr_len = sizeof(rth->local); + if (getsockname(rth->fd, (struct sockaddr*)&rth->local, &addr_len) < 0) { + perror("Cannot getsockname"); + return -1; + } + if (addr_len != sizeof(rth->local)) { + fprintf(stderr, "Wrong address length %d\n", addr_len); + return -1; + } + if (rth->local.nl_family != AF_NETLINK) { + fprintf(stderr, "Wrong 
address family %d\n", rth->local.nl_family); + return -1; + } + rth->seq = time(NULL); + return 0; +} + +int rtnl_open(struct rtnl_handle *rth, unsigned subscriptions) +{ + return rtnl_open_byproto(rth, subscriptions, NETLINK_ROUTE); +} + +int rtnl_wilddump_request(struct rtnl_handle *rth, int family, int type) +{ + struct { + struct nlmsghdr nlh; + struct rtgenmsg g; + } req; + struct sockaddr_nl nladdr; + + memset(&nladdr, 0, sizeof(nladdr)); + nladdr.nl_family = AF_NETLINK; + + memset(&req, 0, sizeof(req)); + req.nlh.nlmsg_len = sizeof(req); + req.nlh.nlmsg_type = type; + req.nlh.nlmsg_flags = NLM_F_ROOT|NLM_F_MATCH|NLM_F_REQUEST; + req.nlh.nlmsg_pid = 0; + req.nlh.nlmsg_seq = rth->dump = ++rth->seq; + req.g.rtgen_family = family; + + return sendto(rth->fd, (void*)&req, sizeof(req), 0, + (struct sockaddr*)&nladdr, sizeof(nladdr)); +} + +int rtnl_send(struct rtnl_handle *rth, const char *buf, int len) +{ + struct sockaddr_nl nladdr; + + memset(&nladdr, 0, sizeof(nladdr)); + nladdr.nl_family = AF_NETLINK; + + return sendto(rth->fd, buf, len, 0, (struct sockaddr*)&nladdr, sizeof(nladdr)); +} + +int rtnl_dump_request(struct rtnl_handle *rth, int type, void *req, int len) +{ + struct nlmsghdr nlh; + struct sockaddr_nl nladdr; + struct iovec iov[2] = { + { .iov_base = &nlh, .iov_len = sizeof(nlh) }, + { .iov_base = req, .iov_len = len } + }; + struct msghdr msg = { + .msg_name = &nladdr, + .msg_namelen = sizeof(nladdr), + .msg_iov = iov, + .msg_iovlen = 2, + }; + + memset(&nladdr, 0, sizeof(nladdr)); + nladdr.nl_family = AF_NETLINK; + + nlh.nlmsg_len = NLMSG_LENGTH(len); + nlh.nlmsg_type = type; + nlh.nlmsg_flags = NLM_F_ROOT|NLM_F_MATCH|NLM_F_REQUEST; + nlh.nlmsg_pid = 0; + nlh.nlmsg_seq = rth->dump = ++rth->seq; + + return sendmsg(rth->fd, &msg, 0); +} + +int rtnl_dump_filter(struct rtnl_handle *rth, + rtnl_filter_t filter, + void *arg1, + rtnl_filter_t junk, + void *arg2) +{ + struct sockaddr_nl nladdr; + struct iovec iov; + struct msghdr msg = { + .msg_name = 
&nladdr, + .msg_namelen = sizeof(nladdr), + .msg_iov = &iov, + .msg_iovlen = 1, + }; + char buf[16384]; + + iov.iov_base = buf; + while (1) { + int status; + struct nlmsghdr *h; + + iov.iov_len = sizeof(buf); + status = recvmsg(rth->fd, &msg, 0); + + if (status < 0) { + if (errno == EINTR) + continue; + perror("OVERRUN"); + continue; + } + + if (status == 0) { + fprintf(stderr, "EOF on netlink\n"); + return -1; + } + + h = (struct nlmsghdr*)buf; + while (NLMSG_OK(h, status)) { + int err; + + if (nladdr.nl_pid != 0 || + h->nlmsg_pid != rth->local.nl_pid || + h->nlmsg_seq != rth->dump) { + if (junk) { + err = junk(&nladdr, h, arg2); + if (err < 0) + return err; + } + goto skip_it; + } + + if (h->nlmsg_type == NLMSG_DONE) + return 0; + if (h->nlmsg_type == NLMSG_ERROR) { + struct nlmsgerr *err = (struct nlmsgerr*)NLMSG_DATA(h); + if (h->nlmsg_len < NLMSG_LENGTH(sizeof(struct nlmsgerr))) { + fprintf(stderr, "ERROR truncated\n"); + } else { + errno = -err->error; + perror("RTNETLINK answers"); + } + return -1; + } + err = filter(&nladdr, h, arg1); + if (err < 0) + return err; + +skip_it: + h = NLMSG_NEXT(h, status); + } + if (msg.msg_flags & MSG_TRUNC) { + fprintf(stderr, "Message truncated\n"); + continue; + } + if (status) { + fprintf(stderr, "!!!Remnant of size %d\n", status); + exit(1); + } + } +} + +int rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n, pid_t peer, + unsigned groups, struct nlmsghdr *answer, + rtnl_filter_t junk, + void *jarg) +{ + int status; + unsigned seq; + struct nlmsghdr *h; + struct sockaddr_nl nladdr; + struct iovec iov = { + .iov_base = (void*) n, + .iov_len = n->nlmsg_len + }; + struct msghdr msg = { + .msg_name = &nladdr, + .msg_namelen = sizeof(nladdr), + .msg_iov = &iov, + .msg_iovlen = 1, + }; + char buf[16384]; + + memset(&nladdr, 0, sizeof(nladdr)); + nladdr.nl_family = AF_NETLINK; + nladdr.nl_pid = peer; + nladdr.nl_groups = groups; + + n->nlmsg_seq = seq = ++rtnl->seq; + + if (answer == NULL) + n->nlmsg_flags |= NLM_F_ACK; + 
+ status = sendmsg(rtnl->fd, &msg, 0); + + if (status < 0) { + perror("Cannot talk to rtnetlink"); + return -1; + } + + memset(buf,0,sizeof(buf)); + + iov.iov_base = buf; + + while (1) { + iov.iov_len = sizeof(buf); + status = recvmsg(rtnl->fd, &msg, 0); + + if (status < 0) { + if (errno == EINTR) + continue; + perror("OVERRUN"); + continue; + } + if (status == 0) { + fprintf(stderr, "EOF on netlink\n"); + return -1; + } + if (msg.msg_namelen != sizeof(nladdr)) { + fprintf(stderr, "sender address length == %d\n", msg.msg_namelen); + exit(1); + } + for (h = (struct nlmsghdr*)buf; status >= sizeof(*h); ) { + int err; + int len = h->nlmsg_len; + int l = len - sizeof(*h); + + if (l<0 || len>status) { + if (msg.msg_flags & MSG_TRUNC) { + fprintf(stderr, "Truncated message\n"); + return -1; + } + fprintf(stderr, "!!!malformed message: len=%d\n", len); + exit(1); + } + + if (nladdr.nl_pid != peer || + h->nlmsg_pid != rtnl->local.nl_pid || + h->nlmsg_seq != seq) { + if (junk) { + err = junk(&nladdr, h, jarg); + if (err < 0) + return err; + } + continue; + } + + if (h->nlmsg_type == NLMSG_ERROR) { + struct nlmsgerr *err = (struct nlmsgerr*)NLMSG_DATA(h); + if (l < sizeof(struct nlmsgerr)) { + fprintf(stderr, "ERROR truncated\n"); + } else { + errno = -err->error; + if (errno == 0) { + if (answer) + memcpy(answer, h, h->nlmsg_len); + return 0; + } + perror("RTNETLINK answers"); + } + return -1; + } + if (answer) { + memcpy(answer, h, h->nlmsg_len); + return 0; + } + + fprintf(stderr, "Unexpected reply!!!\n"); + + status -= NLMSG_ALIGN(len); + h = (struct nlmsghdr*)((char*)h + NLMSG_ALIGN(len)); + } + if (msg.msg_flags & MSG_TRUNC) { + fprintf(stderr, "Message truncated\n"); + continue; + } + if (status) { + fprintf(stderr, "!!!Remnant of size %d\n", status); + exit(1); + } + } +} + +int rtnl_listen(struct rtnl_handle *rtnl, + rtnl_filter_t handler, + void *jarg) +{ + int status; + struct nlmsghdr *h; + struct sockaddr_nl nladdr; + struct iovec iov; + struct msghdr msg = { + 
.msg_name = &nladdr, + .msg_namelen = sizeof(nladdr), + .msg_iov = &iov, + .msg_iovlen = 1, + }; + char buf[8192]; + + memset(&nladdr, 0, sizeof(nladdr)); + nladdr.nl_family = AF_NETLINK; + nladdr.nl_pid = 0; + nladdr.nl_groups = 0; + + iov.iov_base = buf; + while (1) { + iov.iov_len = sizeof(buf); + status = recvmsg(rtnl->fd, &msg, 0); + + if (status < 0) { + if (errno == EINTR) + continue; + perror("OVERRUN"); + continue; + } + if (status == 0) { + fprintf(stderr, "EOF on netlink\n"); + return -1; + } + if (msg.msg_namelen != sizeof(nladdr)) { + fprintf(stderr, "Sender address length == %d\n", msg.msg_namelen); + exit(1); + } + for (h = (struct nlmsghdr*)buf; status >= sizeof(*h); ) { + int err; + int len = h->nlmsg_len; + int l = len - sizeof(*h); + + if (l<0 || len>status) { + if (msg.msg_flags & MSG_TRUNC) { + fprintf(stderr, "Truncated message\n"); + return -1; + } + fprintf(stderr, "!!!malformed message: len=%d\n", len); + exit(1); + } + + err = handler(&nladdr, h, jarg); + if (err < 0) + return err; + + status -= NLMSG_ALIGN(len); + h = (struct nlmsghdr*)((char*)h + NLMSG_ALIGN(len)); + } + if (msg.msg_flags & MSG_TRUNC) { + fprintf(stderr, "Message truncated\n"); + continue; + } + if (status) { + fprintf(stderr, "!!!Remnant of size %d\n", status); + exit(1); + } + } +} + +int rtnl_from_file(FILE *rtnl, rtnl_filter_t handler, + void *jarg) +{ + int status; + struct sockaddr_nl nladdr; + char buf[8192]; + struct nlmsghdr *h = (void*)buf; + + memset(&nladdr, 0, sizeof(nladdr)); + nladdr.nl_family = AF_NETLINK; + nladdr.nl_pid = 0; + nladdr.nl_groups = 0; + + while (1) { + int err, len, type; + int l; + + status = fread(&buf, 1, sizeof(*h), rtnl); + + if (status < 0) { + if (errno == EINTR) + continue; + perror("rtnl_from_file: fread"); + return -1; + } + if (status == 0) + return 0; + + len = h->nlmsg_len; + type= h->nlmsg_type; + l = len - sizeof(*h); + + if (l<0 || len>sizeof(buf)) { + fprintf(stderr, "!!!malformed message: len=%d @%lu\n", + len, 
ftell(rtnl)); + return -1; + } + + status = fread(NLMSG_DATA(h), 1, NLMSG_ALIGN(l), rtnl); + + if (status < 0) { + perror("rtnl_from_file: fread"); + return -1; + } + if (status < l) { + fprintf(stderr, "rtnl-from_file: truncated message\n"); + return -1; + } + + err = handler(&nladdr, h, jarg); + if (err < 0) + return err; + } +} + +int addattr32(struct nlmsghdr *n, int maxlen, int type, __u32 data) +{ + int len = RTA_LENGTH(4); + struct rtattr *rta; + if (NLMSG_ALIGN(n->nlmsg_len) + len > maxlen) { + fprintf(stderr,"addattr32: Error! max allowed bound %d exceeded\n",maxlen); + return -1; + } + rta = NLMSG_TAIL(n); + rta->rta_type = type; + rta->rta_len = len; + memcpy(RTA_DATA(rta), &data, 4); + n->nlmsg_len = NLMSG_ALIGN(n->nlmsg_len) + len; + return 0; +} + +int addattr_l(struct nlmsghdr *n, int maxlen, int type, const void *data, + int alen) +{ + int len = RTA_LENGTH(alen); + struct rtattr *rta; + + if (NLMSG_ALIGN(n->nlmsg_len) + RTA_ALIGN(len) > maxlen) { + fprintf(stderr, "addattr_l ERROR: message exceeded bound of %d\n",maxlen); + return -1; + } + rta = NLMSG_TAIL(n); + rta->rta_type = type; + rta->rta_len = len; + memcpy(RTA_DATA(rta), data, alen); + n->nlmsg_len = NLMSG_ALIGN(n->nlmsg_len) + RTA_ALIGN(len); + return 0; +} + +int addraw_l(struct nlmsghdr *n, int maxlen, const void *data, int len) +{ + if (NLMSG_ALIGN(n->nlmsg_len) + NLMSG_ALIGN(len) > maxlen) { + fprintf(stderr, "addraw_l ERROR: message exceeded bound of %d\n",maxlen); + return -1; + } + + memcpy(NLMSG_TAIL(n), data, len); + memset((void *) NLMSG_TAIL(n) + len, 0, NLMSG_ALIGN(len) - len); + n->nlmsg_len = NLMSG_ALIGN(n->nlmsg_len) + NLMSG_ALIGN(len); + return 0; +} + +int rta_addattr32(struct rtattr *rta, int maxlen, int type, __u32 data) +{ + int len = RTA_LENGTH(4); + struct rtattr *subrta; + + if (RTA_ALIGN(rta->rta_len) + len > maxlen) { + fprintf(stderr,"rta_addattr32: Error! 
max allowed bound %d exceeded\n",maxlen); + return -1; + } + subrta = (struct rtattr*)(((char*)rta) + RTA_ALIGN(rta->rta_len)); + subrta->rta_type = type; + subrta->rta_len = len; + memcpy(RTA_DATA(subrta), &data, 4); + rta->rta_len = NLMSG_ALIGN(rta->rta_len) + len; + return 0; +} + +int rta_addattr_l(struct rtattr *rta, int maxlen, int type, + const void *data, int alen) +{ + struct rtattr *subrta; + int len = RTA_LENGTH(alen); + + if (RTA_ALIGN(rta->rta_len) + RTA_ALIGN(len) > maxlen) { + fprintf(stderr,"rta_addattr_l: Error! max allowed bound %d exceeded\n",maxlen); + return -1; + } + subrta = (struct rtattr*)(((char*)rta) + RTA_ALIGN(rta->rta_len)); + subrta->rta_type = type; + subrta->rta_len = len; + memcpy(RTA_DATA(subrta), data, alen); + rta->rta_len = NLMSG_ALIGN(rta->rta_len) + RTA_ALIGN(len); + return 0; +} + +int parse_rtattr(struct rtattr *tb[], int max, struct rtattr *rta, int len) +{ + memset(tb, 0, sizeof(struct rtattr *) * (max + 1)); + while (RTA_OK(rta, len)) { + if (rta->rta_type <= max) + tb[rta->rta_type] = rta; + rta = RTA_NEXT(rta,len); + } + if (len) + fprintf(stderr, "!!!Deficit %d, rta_len=%d\n", len, rta->rta_len); + return 0; +} + +int parse_rtattr_byindex(struct rtattr *tb[], int max, struct rtattr *rta, int len) +{ + int i = 0; + + memset(tb, 0, sizeof(struct rtattr *) * max); + while (RTA_OK(rta, len)) { + if (rta->rta_type <= max && i < max) + tb[i++] = rta; + rta = RTA_NEXT(rta,len); + } + if (len) + fprintf(stderr, "!!!Deficit %d, rta_len=%d\n", len, rta->rta_len); + return i; +} diff --git a/tools/python/xen/lowlevel/netlink/libnetlink.h b/tools/python/xen/lowlevel/netlink/libnetlink.h new file mode 100644 --- /dev/null +++ b/tools/python/xen/lowlevel/netlink/libnetlink.h @@ -0,0 +1,58 @@ +#ifndef __LIBNETLINK_H__ +#define __LIBNETLINK_H__ 1 + +#include <netinet/in.h> +#include <asm/types.h> +#include <linux/netlink.h> +#include <linux/rtnetlink.h> + +struct rtnl_handle +{ + int fd; + struct sockaddr_nl local; + struct 
sockaddr_nl peer; + __u32 seq; + __u32 dump; +}; + +extern int rtnl_open(struct rtnl_handle *rth, unsigned subscriptions); +extern int rtnl_open_byproto(struct rtnl_handle *rth, unsigned subscriptions, int protocol); +extern void rtnl_close(struct rtnl_handle *rth); +extern int rtnl_wilddump_request(struct rtnl_handle *rth, int fam, int type); +extern int rtnl_dump_request(struct rtnl_handle *rth, int type, void *req, int len); + +typedef int (*rtnl_filter_t)(const struct sockaddr_nl *, + struct nlmsghdr *n, void *); +extern int rtnl_dump_filter(struct rtnl_handle *rth, rtnl_filter_t filter, + void *arg1, + rtnl_filter_t junk, + void *arg2); +extern int rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n, pid_t peer, + unsigned groups, struct nlmsghdr *answer, + rtnl_filter_t junk, + void *jarg); +extern int rtnl_send(struct rtnl_handle *rth, const char *buf, int); + + +extern int addattr32(struct nlmsghdr *n, int maxlen, int type, __u32 data); +extern int addattr_l(struct nlmsghdr *n, int maxlen, int type, const void *data, int alen); +extern int addraw_l(struct nlmsghdr *n, int maxlen, const void *data, int len); +extern int rta_addattr32(struct rtattr *rta, int maxlen, int type, __u32 data); +extern int rta_addattr_l(struct rtattr *rta, int maxlen, int type, const void *data, int alen); + +extern int parse_rtattr(struct rtattr *tb[], int max, struct rtattr *rta, int len); +extern int parse_rtattr_byindex(struct rtattr *tb[], int max, struct rtattr *rta, int len); + +#define parse_rtattr_nested(tb, max, rta) \ + (parse_rtattr((tb), (max), RTA_DATA(rta), RTA_PAYLOAD(rta))) + +extern int rtnl_listen(struct rtnl_handle *, rtnl_filter_t handler, + void *jarg); +extern int rtnl_from_file(FILE *, rtnl_filter_t handler, + void *jarg); + +#define NLMSG_TAIL(nmsg) \ + ((struct rtattr *) (((void *) (nmsg)) + NLMSG_ALIGN((nmsg)->nlmsg_len))) + +#endif /* __LIBNETLINK_H__ */ + diff --git a/tools/python/xen/lowlevel/netlink/netlink.c 
b/tools/python/xen/lowlevel/netlink/netlink.c new file mode 100644 --- /dev/null +++ b/tools/python/xen/lowlevel/netlink/netlink.c @@ -0,0 +1,211 @@ +/* python binding to libnetlink */ + +#include <Python.h> +#include "libnetlink.h" + +#define PKG "xen.lowlevel.netlink" + +typedef struct { + PyObject_HEAD + int opened; + struct rtnl_handle rth; +} PyRtnlObject; + +/* todo: subscriptions? */ +static PyObject* PyRtnl_new(PyTypeObject* type, PyObject* args, + PyObject* kwargs) +{ + return type->tp_alloc(type, 0); +} + +static int PyRtnl_init(PyObject* obj, PyObject* args, PyObject* kwargs) +{ + PyRtnlObject* self = (PyRtnlObject*)obj; + + if (rtnl_open(&self->rth, 0) < 0) { + PyErr_SetString(PyExc_IOError, "could not open rtnl handle"); + return -1; + } + + return 0; +} + +static void PyRtnl_dealloc(PyRtnlObject* obj) +{ + PyRtnlObject* self = (PyRtnlObject*)obj; + + rtnl_close(&self->rth); +} + +static PyObject* pyrtnl_talk(PyObject* obj, PyObject* args) +{ + PyRtnlObject* self = (PyRtnlObject*)obj; + char* msg; + int len; + int peer = 0; + int groups = 0; + + if (!PyArg_ParseTuple(args, "s#|ii", &msg, &len, &peer, &groups)) + return NULL; + + if (rtnl_talk(&self->rth, (struct nlmsghdr*)msg, peer, groups, NULL, NULL, + NULL) < 0) + { + PyErr_SetString(PyExc_IOError, "error sending message"); + return NULL; + } + + Py_RETURN_NONE; +} + +static PyObject* pyrtnl_wilddump_request(PyObject* obj, PyObject* args) +{ + PyRtnlObject* self = (PyRtnlObject*)obj; + int family, type; + + if (!PyArg_ParseTuple(args, "ii", &family, &type)) + return NULL; + + if (rtnl_wilddump_request(&self->rth, family, type) < 0) { + PyErr_SetString(PyExc_IOError, "could not send dump request"); + return NULL; + } + + Py_RETURN_NONE; +} + +static PyObject* pyrtnl_dump_request(PyObject* obj, PyObject* args) +{ + PyRtnlObject* self = (PyRtnlObject*)obj; + int type; + char* req; + int len; + + if (!PyArg_ParseTuple(args, "is#", &type, &req, &len)) + return NULL; + + if (rtnl_dump_request(&self->rth, 
type, req, len) < 0) { + PyErr_SetString(PyExc_IOError, "could not send dump request"); + return NULL; + } + + Py_RETURN_NONE; +} + +/* translate args to python and call python callback */ +static int dump_filter_helper(const struct sockaddr_nl *who, + struct nlmsghdr *n, void *arg) +{ + PyObject* filter = arg; + PyObject* args; + PyObject* result; + + args = Py_BuildValue("s#s#", who, sizeof(*who), n, n->nlmsg_len); + result = PyObject_CallObject(filter, args); + Py_DECREF(args); + if (!result) + return -1; + + /* result is ignored as long as an exception isn''t raised */ + Py_DECREF(result); + return 0; +} + +static PyObject* pyrtnl_dump_filter(PyObject* obj, PyObject* args) +{ + PyRtnlObject* self = (PyRtnlObject*)obj; + PyObject *filter; + + if (!PyArg_ParseTuple(args, "O:dump_filter", &filter)) + return NULL; + + if (!PyCallable_Check(filter)) { + PyErr_SetString(PyExc_TypeError, "parameter must be callable"); + return NULL; + } + + Py_INCREF(filter); + if (rtnl_dump_filter(&self->rth, dump_filter_helper, filter, NULL, + NULL) < 0) + { + Py_DECREF(filter); + return NULL; + } + Py_DECREF(filter); + + Py_RETURN_NONE; +} + +static PyMethodDef PyRtnl_methods[] = { + { "talk", pyrtnl_talk, METH_VARARGS, + "send a message to rtnetlink and receive a response.\n" }, + { "wilddump_request", pyrtnl_wilddump_request, METH_VARARGS, + "dump objects.\n" }, + { "dump_request", pyrtnl_dump_request, METH_VARARGS, + "start a dump of a particular netlink type.\n" }, + { "dump_filter", pyrtnl_dump_filter, METH_VARARGS, + "iterate over an rtnl dump.\n" }, + { NULL } +}; + +static PyTypeObject PyRtnlType = { + PyObject_HEAD_INIT(NULL) + 0, /* ob_size */ + PKG ".rtnl", /* tp_name */ + sizeof(PyRtnlObject), /* tp_basicsize */ + 0, /* tp_itemsize */ + (destructor)PyRtnl_dealloc, /* tp_dealloc */ + NULL, /* tp_print */ + NULL, /* tp_getattr */ + NULL, /* tp_setattr */ + NULL, /* tp_compare */ + NULL, /* tp_repr */ + NULL, /* tp_as_number */ + NULL, /* tp_as_sequence */ + NULL, /* 
tp_as_mapping */ + NULL, /* tp_hash */ + NULL, /* tp_call */ + NULL, /* tp_str */ + NULL, /* tp_getattro */ + NULL, /* tp_setattro */ + NULL, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT, /* tp_flags */ + "rtnetlink handle", /* tp_doc */ + NULL, /* tp_traverse */ + NULL, /* tp_clear */ + NULL, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + NULL, /* tp_iter */ + NULL, /* tp_iternext */ + PyRtnl_methods, /* tp_methods */ + NULL, /* tp_members */ + NULL, /* tp_getset */ + NULL, /* tp_base */ + NULL, /* tp_dict */ + NULL, /* tp_descr_get */ + NULL, /* tp_descr_set */ + 0, /* tp_dictoffset */ + PyRtnl_init, /* tp_init */ + NULL, /* tp_alloc */ + PyRtnl_new, /* tp_new */ +}; + +static PyMethodDef methods[] = { + { NULL } +}; + +static char doc[] = "libnetlink wrapper"; + +PyMODINIT_FUNC initnetlink(void) +{ + PyObject *mod; + + if (PyType_Ready(&PyRtnlType) == -1) + return; + + if (!(mod = Py_InitModule3(PKG, methods, doc))) + return; + + Py_INCREF(&PyRtnlType); + PyModule_AddObject(mod, "rtnl", (PyObject *)&PyRtnlType); +} diff --git a/tools/python/xen/remus/__init__.py b/tools/python/xen/remus/__init__.py new file mode 100644 diff --git a/tools/python/xen/remus/blkdev.py b/tools/python/xen/remus/blkdev.py new file mode 100644 --- /dev/null +++ b/tools/python/xen/remus/blkdev.py @@ -0,0 +1,31 @@ +handlers = [] + +class BlkDevException(Exception): pass + +class BlkDev(object): + "Object representing a VM block device" + def __init__(self, **props): + self.uname = '''' + if ''dev'' not in props: + raise BlkDevException(''no device'') + #if ''uname'' not in props: + #raise BlkDevException(''no uname'') + if ''mode'' not in props: + raise BlkDevException(''no mode'') + self.__dict__.update(props) + self.dev = props[''dev''].rstrip('':disk'') + + def __str__(self): + return ''%s,%s,%s'' % (self.uname, self.dev, self.mode) + +def register(handler): + "register a block device class with parser" + if handler not in handlers: + handlers.insert(0, handler) + +def parse(props): + 
"turn a vm device dictionary into a blkdev object" + for handler in handlers: + if handler.handles(**props): + return handler(**props) + return BlkDev(**props) diff --git a/tools/python/xen/remus/image.py b/tools/python/xen/remus/image.py new file mode 100644 --- /dev/null +++ b/tools/python/xen/remus/image.py @@ -0,0 +1,227 @@ +# VM image file manipulation + +import logging, struct + +import vm + +SIGNATURE = ''LinuxGuestRecord'' +LONGLEN = struct.calcsize(''L'') +INTLEN = struct.calcsize(''i'') +PAGE_SIZE = 4096 +# ~0L +P2M_EXT_SIG = 4294967295L +# frames per page +FPP = 1024 +LTAB_MASK = 0xf << 28 +BATCH_SIZE = 1024 +IDXLEN = INTLEN + BATCH_SIZE * LONGLEN + +logging.basicConfig(level=logging.DEBUG) +log = logging.getLogger() + +class VMParseException(Exception): pass + +class VMImage(object): + def __init__(self, img=None): + """img may be a path or a file object. + If compact is True, apply checkpoints to base image instead + of simply concatenating them. + """ + self.img = img + + self.dom = None + self.fd = None + self.header = None + self.nr_pfns = 0 + # p2m extension header (unparsed) + self.p2mext = None + + if self.img: + self.open(self.img) + + def open(self, img): + if isinstance(img, str): + self.fd = file(img, ''rb'') + else: + self.fd = img + + self.readheader() + + def readheader(self): + sig = self.fd.read(len(SIGNATURE)) + if sig != SIGNATURE: + raise VMParseException("Bad signature in image") + + hlen = self.fd.read(INTLEN) + hlen, = struct.unpack(''!i'', hlen) + + self.header = self.fd.read(hlen) + self.dom = parseheader(self.header) + + def readp2mfl(self): + "read the P2M frame list" + pfnlen = self.fd.read(LONGLEN) + self.nr_pfns, = struct.unpack(''L'', pfnlen) + p2m0 = self.fd.read(LONGLEN) + + p2mhdr = p2m0 + p2m0, = struct.unpack(''L'', p2m0) + if p2m0 == P2M_EXT_SIG: + elen = self.fd.read(INTLEN) + elen, = struct.unpack(''I'', elen) + + self.p2mext = self.fd.read(elen) + + p2m0 = self.fd.read(LONGLEN) + p2m0, = struct.unpack(''L'', p2m0) 
+ p2mfl = [p2m0] + + p2mfle = (self.nr_pfns + FPP - 1)/FPP - 1 + p2ms = self.fd.read(LONGLEN * p2mfle) + p2mfl.extend(struct.unpack(''%dL'' % p2mfle, p2ms)) + + self.p2mfl = p2mfl + + def flush(self): + self.ofd.write(self.tail) + +class Writer(object): + """compress a stream of checkpoints into a single image of the + last checkpoint""" + def __init__(self, fd, compact=False): + self.fd = fd + self.compact = compact + + self.vm = None + self.tail = None + # offset to first batch of pages + self.imgstart = 0 + # PFN mappings + self.pfns = [] + + def __del__(self): + self.close() + + def writeheader(self): + hlen = struct.pack(''!i'', len(self.vm.header)) + header = ''''.join([SIGNATURE, hlen, self.vm.header]) + self.fd.write(header) + + def writep2mfl(self): + p2m = [struct.pack(''L'', self.vm.nr_pfns)] + if self.vm.p2mext: + p2m.extend([struct.pack(''L'', P2M_EXT_SIG), self.vm.p2mext]) + p2m.append(struct.pack(''%dL'' % len(self.vm.p2mfl), *self.vm.p2mfl)) + self.fd.write(''''.join(p2m)) + + def writebatch(self, batch): + def offset(pfn): + isz = (pfn / BATCH_SIZE + 1) * IDXLEN + return self.imgstart + isz + pfn * PAGE_SIZE + + if not self.compact: + return self.fd.write(batch) + + batch = parsebatch(batch) + # sort pages for better disk seek behaviour + batch.sort(lambda x, y: cmp(x[0] & ~LTAB_MASK, y[0] & ~LTAB_MASK)) + + for pfndesc, page in batch: + pfn = pfndesc & ~LTAB_MASK + if pfn > self.vm.nr_pfns: + log.error(''INVALID PFN: %d'' % pfn) + if len(self.pfns) <= pfn: + self.pfns.extend([0] * (pfn - len(self.pfns) + 1)) + self.pfns[pfn] = pfndesc + self.fd.seek(offset(pfn)) + self.fd.write(page) + + #print "max offset: %d, %d" % (len(self.pfns), offset(self.pfns[-1])) + + def writeindex(self): + "Write batch header in front of each page" + hdrlen = INTLEN + BATCH_SIZE * LONGLEN + batches = (len(self.pfns) + BATCH_SIZE - 1) / BATCH_SIZE + + for i in xrange(batches): + offset = self.imgstart + i * (hdrlen + (PAGE_SIZE * BATCH_SIZE)) + pfnoff = i * BATCH_SIZE + 
# python auto-clamps overreads + pfns = self.pfns[pfnoff:pfnoff + BATCH_SIZE] + + self.fd.seek(offset) + self.fd.write(struct.pack(''i'', len(pfns))) + self.fd.write(struct.pack(''%dL'' % len(pfns), *pfns)) + + def slurp(self, ifd): + """Apply an incremental checkpoint to a loaded image. + accepts a path or a file object.""" + if isinstance(ifd, str): + ifd = file(ifd, ''rb'') + + if not self.vm: + self.vm = VMImage(ifd) + self.writeheader() + + self.vm.readp2mfl() + self.writep2mfl() + self.imgstart = self.fd.tell() + + while True: + l, batch = readbatch(ifd) + if l <= 0: + break + self.writebatch(batch) + self.tail = batch + ifd.read() + + def flush(self): + if self.tail: + self.fd.seek(0, 2) + self.fd.write(self.tail) + if self.compact: + self.writeindex() + self.tail = None + + def close(self): + self.flush() + +def parseheader(header): + "parses a header sexpression" + return vm.parsedominfo(vm.strtosxpr(header)) + +def makeheader(dominfo): + "create an image header from a VM dominfo sxpr" + items = [SIGNATURE] + sxpr = vm.sxprtostr(dominfo) + items.append(struct.pack(''!i'', len(sxpr))) + items.append(sxpr) + return ''''.join(items) + +def readbatch(fd): + batch = [] + batchlen = fd.read(INTLEN) + batch.append(batchlen) + batchlen, = struct.unpack(''i'', batchlen) + log.info("batch length: %d" % batchlen) + if batchlen <= 0: + return (batchlen, batch[0]) + + batchfns = fd.read(LONGLEN * batchlen) + batch.append(batchfns) + pages = fd.read(PAGE_SIZE * batchlen) + if len(pages) != PAGE_SIZE * batchlen: + log.error(''SHORT READ: %d'' % len(pages)) + batch.append(pages) + + return (batchlen, ''''.join(batch)) + +def parsebatch(batch): + "parse a batch string into pages" + batchlen, batch = batch[:INTLEN], batch[INTLEN:] + batchlen, = struct.unpack(''i'', batchlen) + #print ''batch length: %d'' % batchlen + pfnlen = batchlen * LONGLEN + pfns = struct.unpack(''%dL'' % batchlen, batch[:pfnlen]) + pagebuf = batch[pfnlen:] + pages = 
[pagebuf[i*PAGE_SIZE:(i+1)*PAGE_SIZE] for i in xrange(batchlen)] + return zip(pfns, pages) diff --git a/tools/python/xen/remus/netlink.py b/tools/python/xen/remus/netlink.py new file mode 100644 --- /dev/null +++ b/tools/python/xen/remus/netlink.py @@ -0,0 +1,314 @@ +# netlink wrappers + +import socket, struct +import xen.lowlevel.netlink + +NETLINK_ROUTE = 0 + +NLM_F_REQUEST = 1 # It is request message. +NLM_F_MULTI = 2 # Multipart message, terminated by NLMSG_DONE +NLM_F_ACK = 4 # Reply with ack, with zero or error code +NLM_F_ECHO = 8 # Echo this request + +# Modifiers to GET request +NLM_F_ROOT = 0x100 # specify tree root +NLM_F_MATCH = 0x200 # return all matching +NLM_F_ATOMIC = 0x400 # atomic GET +NLM_F_DUMP = NLM_F_ROOT|NLM_F_MATCH + +# Modifiers to NEW request +NLM_F_REPLACE = 0x100 # Override existing +NLM_F_EXCL = 0x200 # Do not touch, if it exists +NLM_F_CREATE = 0x400 # Create, if it does not exist +NLM_F_APPEND = 0x800 # Add to end of list + +RTM_NEWLINK = 16 +RTM_GETLINK = 18 +RTM_NEWQDISC = 36 +RTM_DELQDISC = 37 +RTM_GETQDISC = 38 + +IFLA_UNSPEC = 0 +IFLA_ADDRESS = 1 +IFLA_BROADCAST = 2 +IFLA_IFNAME = 3 +IFLA_MTU = 4 +IFLA_LINK = 5 +IFLA_QDISC = 6 +IFLA_STATS = 7 +IFLA_COST = 8 +IFLA_PRIORITY = 9 +IFLA_MASTER = 10 +IFLA_WIRELESS = 11 +IFLA_PROTINFO = 12 +IFLA_TXQLEN = 13 +IFLA_MAP = 14 +IFLA_WEIGHT = 15 + +TCA_UNSPEC = 0 +TCA_KIND = 1 +TCA_OPTIONS = 2 +TCA_STATS = 3 +TCA_XSTATS = 4 +TCA_RATE = 5 +TCA_FCNT = 6 +TCA_STATS2 = 7 + +class RTNLException(Exception): pass + +def align(l, alignto=4): + return (l + alignto - 1) & ~(alignto - 1) + +class rtattr(object): + "rtattribute" + fmt = "HH" + fmtlen = struct.calcsize(fmt) + + def __init__(self, msg=None): + if msg: + self.unpack(msg) + else: + self.rta_len = 0 + self.rta_type = 0 + + self.body = '''' + + def __len__(self): + return align(self.rta_len) + + def pack(self): + self.rta_len = align(self.fmtlen + len(self.body)) + s = struct.pack(self.fmt, self.rta_len, self.rta_type) + self.body + pad = 
self.rta_len - len(s) + if pad: + s += ''\0'' * pad + return s + + def unpack(self, msg): + args = struct.unpack(self.fmt, msg[:self.fmtlen]) + self.rta_len, self.rta_type = args + + self.body = msg[align(self.fmtlen):self.rta_len] + +class rtattrlist(object): + def __init__(self, msg): + self.start = msg + + def __iter__(self): + body = self.start + while len(body) > rtattr.fmtlen: + rta = rtattr(body) + yield rta + body = body[len(rta):] + +class nlmsg(object): + "netlink message header" + fmt = "IHHII" + fmtlen = struct.calcsize(fmt) + + def __init__(self, msg=None): + if msg: + self.unpack(msg) + else: + self.nlmsg_len = 0 + self.nlmsg_type = 0 + self.nlmsg_flags = 0 + self.nlmsg_seq = 0 + self.nlmsg_pid = 0 + + self.rta = '''' + self.body = '''' + + def __len__(self): + return align(self.fmtlen + len(self.body) + len(self.rta)) + + def addattr(self, type, data): + attr = rtattr() + attr.rta_type = type + attr.body = data + self.rta += attr.pack() + + def settype(self, cmd): + self.nlmsg_type = cmd + + def pack(self): + return struct.pack(self.fmt, len(self), self.nlmsg_type, + self.nlmsg_flags, self.nlmsg_seq, + self.nlmsg_pid) + self.body + self.rta + + def unpack(self, msg): + args = struct.unpack(self.fmt, msg[:self.fmtlen]) + self.nlmsg_len, self.nlmsg_type, self.nlmsg_flags = args[:3] + self.nlmsg_seq, self.nlmsg_pid = args[3:] + + self.body = msg[align(self.fmtlen):] + self.rta = '''' + + def __str__(self): + return ''<netlink message, len %d, type %d>'' % \ + (self.nlmsg_len, self.nlmsg_type) + +class ifinfomsg(object): + "interface info message" + fmt = "BxHiII" + fmtlen = struct.calcsize(fmt) + + def __init__(self, msg=None): + if msg: + self.unpack(msg) + else: + self.ifi_family = 0 + self.ifi_type = 0 + self.ifi_index = 0 + self.ifi_flags = 0 + self.ifi_change = 0 + + self.body = '''' + + def unpack(self, msg): + args = struct.unpack(self.fmt, msg[:self.fmtlen]) + self.ifi_family, self.ifi_type, self.ifi_index= args[:3] + self.ifi_flags, 
self.ifi_change = args[3:] + + self.body = msg[align(self.fmtlen):] + + def __str__(self): + return ''<ifinfo message, family %d, type %d, index %d>'' % \ + (self.ifi_family, self.ifi_type, self.ifi_index) + +class tcmsg(object): + "TC message" + fmt = "BxxxiIII" + fmtlen = struct.calcsize(fmt) + + def __init__(self, msg=None): + if msg: + self.unpack(msg) + else: + self.tcm_family = socket.AF_UNSPEC + self.tcm_ifindex = 0 + self.tcm_handle = 0 + self.tcm_parent = 0 + self.tcm_info = 0 + + self.rta = '''' + + def unpack(self, msg): + args = struct.unpack(self.fmt, msg[:self.fmtlen]) + self.tcm_family, self.tcm_ifindex, self.tcm_handle = args[:3] + self.tcm_parent, self.tcm_info = args[3:] + + self.rta = msg[align(self.fmtlen):] + + def pack(self): + return struct.pack(self.fmt, self.tcm_family, self.tcm_ifindex, + self.tcm_handle, self.tcm_parent, self.tcm_info) + + def __str__(self): + return ''<tc message, family %d, index %d>'' % \ + (self.tcm_family, self.tcm_ifindex) + +class newlinkmsg(object): + def __init__(self, nlmsg): + if nlmsg.nlmsg_type != RTM_NEWLINK: + raise RTNLException("wrong message type") + self.nlmsg = nlmsg + self.ifi = ifinfomsg(self.nlmsg.body) + + self.rtattrs = {} + for rta in rtattrlist(self.ifi.body): + self.rtattrs[rta.rta_type] = rta.body + +class newqdiscmsg(object): + def __init__(self, nlmsg): + if nlmsg.nlmsg_type != RTM_NEWQDISC: + raise RTNLException("wrong message type") + self.nlmsg = nlmsg + self.t = tcmsg(self.nlmsg.body) + + self.rtattrs = {} + for rta in rtattrlist(self.t.rta): + self.rtattrs[rta.rta_type] = rta.body + +class rtnl(object): + def __init__(self): + self._rth = xen.lowlevel.netlink.rtnl() + self._linkcache = None + + def getlink(self, key, cached=False): + """returns the interface object corresponding to the key, which + may be an index number or device name.""" + if not cached: + self._linkcache = None + if self._linkcache is None: + self._linkcache = self.getlinks() + + if isinstance(key, int): + return 
self._linkcache.get(key) + + for k, v in self._linkcache.iteritems(): + if v[''name''] == key: + return v + + return None + + def getlinks(self): + """returns a dictionary of interfaces keyed by kernel + interface index""" + links = {} + def dumpfilter(addr, msgstr): + msg = newlinkmsg(nlmsg(msgstr)) + idx = msg.ifi.ifi_index + ifname = msg.rtattrs[IFLA_IFNAME].strip(''\0'') + address = msg.rtattrs.get(IFLA_ADDRESS) + + link = {''index'': idx, + ''type'': msg.ifi.ifi_type, + ''name'': ifname, + ''address'': address} + links[idx] = link + + self._rth.wilddump_request(socket.AF_UNSPEC, RTM_GETLINK) + self._rth.dump_filter(dumpfilter) + + return links + + def getqdisc(self, dev): + """returns the queueing discipline on device dev, which may be + specified by kernel index or device name""" + qdiscs = self.getqdiscs(dev) + if qdiscs: + return qdiscs.values()[0] + return None + + def getqdiscs(self, dev=None): + """returns a dictionary of queueing disciplines keyed by kernel + interface index""" + qdiscs = {} + def dumpfilter(addr, msgstr): + msg = newqdiscmsg(nlmsg(msgstr)) + idx = msg.t.tcm_ifindex + handle = msg.t.tcm_handle + kind = msg.rtattrs[TCA_KIND].strip(''\0'') + opts = msg.rtattrs.get(TCA_OPTIONS) + + qdisc = {''index'': idx, + ''handle'': handle, + ''kind'': kind, + ''options'': opts} + qdiscs[idx] = qdisc + + tcm = tcmsg() + if dev: + link = self.getlink(dev) + if not link: + raise QdiscException(''device %s not found'' % dev) + tcm.tcm_ifindex = link[''index''] + + msg = tcm.pack() + self._rth.dump_request(RTM_GETQDISC, msg) + self._rth.dump_filter(dumpfilter) + return qdiscs + + def talk(self, req): + self._rth.talk(req) diff --git a/tools/python/xen/remus/profile.py b/tools/python/xen/remus/profile.py new file mode 100644 --- /dev/null +++ b/tools/python/xen/remus/profile.py @@ -0,0 +1,56 @@ +"""Simple profiling module +""" + +import time + +class ProfileBlock(object): + """A section of code to be profiled""" + def __init__(self, name): + self.name = 
name + + def enter(self): + print "PROF: entered %s at %f" % (self.name, time.time()) + + def exit(self): + print "PROF: exited %s at %f" % (self.name, time.time()) + +class NullProfiler(object): + def enter(self, name): + pass + + def exit(self, name=None): + pass + +class Profiler(object): + def __init__(self): + self.blocks = {} + self.running = [] + + def enter(self, name): + try: + block = self.blocks[name] + except KeyError: + block = ProfileBlock(name) + self.blocks[name] = block + + block.enter() + self.running.append(block) + + def exit(self, name=None): + if name is not None: + block = None + while self.running: + tmp = self.running.pop() + if tmp.name == name: + block = tmp + break + tmp.exit() + if not block: + raise KeyError(''block %s not running'' % name) + else: + try: + block = self.running.pop() + except IndexError: + raise KeyError(''no block running'') + + block.exit() diff --git a/tools/python/xen/remus/qdisc.py b/tools/python/xen/remus/qdisc.py new file mode 100644 --- /dev/null +++ b/tools/python/xen/remus/qdisc.py @@ -0,0 +1,178 @@ +import socket, struct + +import netlink + +qdisc_kinds = {} + +TC_H_ROOT = 0xFFFFFFFF + +class QdiscException(Exception): pass + +class request(object): + "qdisc request message" + def __init__(self, cmd, flags=0, dev=None, handle=0): + self.n = netlink.nlmsg() + self.t = netlink.tcmsg() + + self.n.nlmsg_flags = netlink.NLM_F_REQUEST|flags + self.n.nlmsg_type = cmd + self.t.tcm_family = socket.AF_UNSPEC + + if not handle: + handle = TC_H_ROOT + self.t.tcm_parent = handle + + if dev: + self.t.tcm_ifindex = dev + + def pack(self): + t = self.t.pack() + self.n.body = t + return self.n.pack() + +class addrequest(request): + def __init__(self, dev, handle, qdisc): + flags = netlink.NLM_F_EXCL|netlink.NLM_F_CREATE + super(addrequest, self).__init__(netlink.RTM_NEWQDISC, flags=flags, + dev=dev, handle=handle) + self.n.addattr(netlink.TCA_KIND, qdisc.kind) + opts = qdisc.pack() + if opts: + 
self.n.addattr(netlink.TCA_OPTIONS, opts) + +class delrequest(request): + def __init__(self, dev, handle): + super(delrequest, self).__init__(netlink.RTM_DELQDISC, dev=dev, + handle=handle) + +class changerequest(request): + def __init__(self, dev, handle, qdisc): + super(changerequest, self).__init__(netlink.RTM_NEWQDISC, + dev=dev, handle=handle) + self.n.addattr(netlink.TCA_KIND, qdisc.kind) + opts = qdisc.pack() + if opts: + self.n.addattr(netlink.TCA_OPTIONS, opts) + +class Qdisc(object): + def __new__(cls, qdict=None, *args, **opts): + if qdict: + kind = qdict.get(''kind'') + cls = qdisc_kinds.get(kind, cls) + obj = super(Qdisc, cls).__new__(cls, qdict=qdict, *args, **opts) + return obj + + def __init__(self, qdict): + self._qdict = qdict + self.kind = qdict[''kind''] + self.handle = qdict[''handle''] >> 16 + + def parse(self, opts): + if opts: + raise QdiscException(''cannot parse qdisc parameters'') + + def optstr(self): + if self.qdict[''options'']: + return ''[cannot parse qdisc parameters]'' + else: + return '''' + + def pack(self): + return '''' + +TC_PRIO_MAX = 15 +class PrioQdisc(Qdisc): + fmt = ''i%sB'' % (TC_PRIO_MAX + 1) + + def __init__(self, qdict): + super(PrioQdisc, self).__init__(qdict) + + if qdict.get(''options''): + self.unpack(qdict[''options'']) + else: + self.bands = 3 + self.priomap = [1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] + + def pack(self): + #return struct.pack(self.fmt, self.bands, *self.priomap) + return '''' + + def unpack(self, opts): + args = struct.unpack(self.fmt, opts) + self.bands = args[0] + self.priomap = args[1:] + + def optstr(self): + mapstr = '' ''.join([str(p) for p in self.priomap]) + return ''bands %d priomap %s'' % (self.bands, mapstr) + +qdisc_kinds[''prio''] = PrioQdisc +qdisc_kinds[''pfifo_fast''] = PrioQdisc + +class CfifoQdisc(Qdisc): + fmt = ''II'' + + def __init__(self, qdict): + super(CfifoQdisc, self).__init__(qdict) + + if qdict.get(''options''): + self.unpack(qdict[''options'']) + else: + 
self.epoch = 0 + self.vmid = 0 + + def pack(self): + return struct.pack(self.fmt, self.epoch, self.vmid) + + def unpack(self, opts): + self.epoch, self.vmid = struct.unpack(self.fmt, opts) + + def parse(self, opts): + args = list(opts) + try: + while args: + arg = args.pop(0) + if arg == ''epoch'': + self.epoch = int(args.pop(0)) + continue + if arg.lower() == ''vmid'': + self.vmid = int(args.pop(0)) + continue + except Exception, inst: + raise QdiscException(str(inst)) + + def optstr(self): + return ''epoch %d vmID %d'' % (self.epoch, self.vmid) + +qdisc_kinds[''cfifo''] = CfifoQdisc + +TC_QUEUE_CHECKPOINT = 0 +TC_QUEUE_RELEASE = 1 + +class QueueQdisc(Qdisc): + fmt = ''I'' + + def __init__(self, qdict=None): + if not qdict: + qdict = {''kind'': ''queue'', + ''handle'': TC_H_ROOT} + super(QueueQdisc, self).__init__(qdict) + + self.action = 0 + + def pack(self): + return struct.pack(self.fmt, self.action) + + def parse(self, args): + if not args: + raise QdiscException(''no action given'') + arg = args[0] + + if arg == ''checkpoint'': + self.action = TC_QUEUE_CHECKPOINT + elif arg == ''release'': + self.action = TC_QUEUE_RELEASE + else: + raise QdiscException(''unknown action'') + +qdisc_kinds[''queue''] = QueueQdisc diff --git a/tools/python/xen/remus/save.py b/tools/python/xen/remus/save.py new file mode 100644 --- /dev/null +++ b/tools/python/xen/remus/save.py @@ -0,0 +1,172 @@ +#!/usr/bin/env python + +import os, select, socket, threading, time, signal, xmlrpclib + +from xen.xend.XendClient import server +from xen.xend.xenstore.xswatch import xswatch + +import xen.lowlevel.xc +from xen.xend.xenstore import xsutil +xc = xen.lowlevel.xc.xc() + +import xen.lowlevel.checkpoint + +import vm, image + +XCFLAGS_LIVE = 1 + +xcsave = ''/usr/lib/xen/bin/xc_save'' + +class _proxy(object): + "proxy simulates an object without inheritance" + def __init__(self, obj): + self._obj = obj + + def __getattr__(self, name): + return getattr(self._obj, name) + + def proxy(self, obj): 
+ self._obj = obj + +class CheckpointError(Exception): pass + +class CheckpointingFile(_proxy): + """Tee writes into separate file objects for each round. + This is necessary because xc_save gets a single file descriptor + for the duration of checkpointing. + """ + def __init__(self, path): + self.path = path + + self.round = 0 + self.rfd, self.wfd = os.pipe() + self.fd = file(path, ''wb'') + + # this pipe is used to notify the writer thread of checkpoints + self.cprfd, self.cpwfd = os.pipe() + + super(CheckpointingFile, self).__init__(self.fd) + + wt = threading.Thread(target=self._wrthread, name=''disk-write-thread'') + wt.setDaemon(True) + wt.start() + self.wt = wt + + def fileno(self): + return self.wfd + + def close(self): + os.close(self.wfd) + # closing wfd should signal writer to stop + self.wt.join() + os.close(self.rfd) + os.close(self.cprfd) + os.close(self.cpwfd) + self.fd.close() + self.wt = None + + def checkpoint(self): + os.write(self.cpwfd, ''1'') + + def _wrthread(self): + while True: + r, o, e = select.select((self.rfd, self.cprfd), (), ()) + if self.rfd in r: + data = os.read(self.rfd, 256 * 1024) + if not data: + break + self.fd.write(data) + if self.cprfd in r: + junk = os.read(self.cprfd, 1) + self.round += 1 + self.fd = file(''%s.%d'' % (self.path, self.round), ''wb'') + self.proxy(self.fd) + +class MigrationSocket(_proxy): + def __init__(self, address): + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock.connect(address) + + sock.send("receive\n") + sock.recv(80) + + fd = os.fdopen(sock.fileno(), ''w+'') + + self.sock = sock + super(MigrationSocket, self).__init__(fd) + +class Keepalive(object): + "Call a keepalive method at intervals" + def __init__(self, method, interval=0.1): + self.keepalive = method + self.interval = interval + + self.thread = None + self.running = False + + def start(self): + if not self.interval: + return + self.thread = threading.Thread(target=self.run, name=''keepalive-thread'') + 
self.thread.setDaemon(True) + self.running = True + self.thread.start() + + def stop(self): + if not self.thread: + return + self.running = False + self.thread.join() + self.thread = None + + def run(self): + while self.running: + self.keepalive() + time.sleep(self.interval) + self.keepalive(stop=True) + +class Saver(object): + def __init__(self, domid, fd, suspendcb=None, resumecb=None, + checkpointcb=None, interval=0): + """Create a Saver object for taking guest checkpoints. + domid: name, number or UUID of a running domain + fd: a stream to which checkpoint data will be written. + suspendcb: callback invoked after guest is suspended + resumecb: callback invoked before guest resumes + checkpointcb: callback invoked when a checkpoint is complete. Return + True to take another checkpoint, or False to stop. + """ + self.fd = fd + self.suspendcb = suspendcb + self.resumecb = resumecb + self.checkpointcb = checkpointcb + self.interval = interval + + self.vm = vm.VM(domid) + + self.checkpointer = None + + def start(self): + vm.getshadowmem(self.vm) + + hdr = image.makeheader(self.vm.dominfo) + self.fd.write(hdr) + self.fd.flush() + + self.checkpointer = xen.lowlevel.checkpoint.checkpointer() + try: + self.checkpointer.open(self.vm.domid) + self.checkpointer.start(self.fd, self.suspendcb, self.resumecb, + self.checkpointcb, self.interval) + self.checkpointer.close() + except xen.lowlevel.checkpoint.error, e: + raise CheckpointError(e) + + def _resume(self): + """low-overhead version of XendDomainInfo.resumeDomain""" + # TODO: currently assumes SUSPEND_CANCEL is available + if True: + xc.domain_resume(self.vm.domid, 1) + xsutil.ResumeDomain(self.vm.domid) + else: + server.xend.domain.resumeDomain(self.vm.domid) diff --git a/tools/python/xen/remus/tapdisk.py b/tools/python/xen/remus/tapdisk.py new file mode 100644 --- /dev/null +++ b/tools/python/xen/remus/tapdisk.py @@ -0,0 +1,4 @@ +import blkdev + +class TapDisk(BlkDev): + pass diff --git 
a/tools/python/xen/remus/util.py b/tools/python/xen/remus/util.py new file mode 100644 --- /dev/null +++ b/tools/python/xen/remus/util.py @@ -0,0 +1,31 @@ +# utility functions + +import os, subprocess + +class PipeException(Exception): + def __init__(self, message, errno): + self.errno = errno + message = ''%s: %d, %s'' % (message, errno, os.strerror(errno)) + Exception.__init__(self, message) + +def canonifymac(mac): + return '':''.join([''%02x'' % int(field, 16) for field in mac.split('':'')]) + +def runcmd(args, cwd=None): + # TODO: stdin handling + if type(args) == str: + args = args.split('' '') + try: + proc = subprocess.Popen(args, stdout=subprocess.PIPE, + stderr=subprocess.PIPE, close_fds=True, + cwd=cwd) + stdout = proc.stdout.read() + stderr = proc.stderr.read() + proc.wait() + if proc.returncode: + print '' ''.join(args) + print stderr.strip() + raise PipeException(''%s failed'' % args[0], proc.returncode) + return stdout + except (OSError, IOError), inst: + raise PipeException(''could not run %s'' % args[0], inst.errno) diff --git a/tools/python/xen/remus/vbd.py b/tools/python/xen/remus/vbd.py new file mode 100644 --- /dev/null +++ b/tools/python/xen/remus/vbd.py @@ -0,0 +1,9 @@ +import blkdev + +class VBD(blkdev.BlkDev): + def handles(self, **props): + uname = props.get(''uname'', '''') + return uname.startswith(''phy:'') + handles = classmethod(handles) + +blkdev.register(VBD) diff --git a/tools/python/xen/remus/vdi.py b/tools/python/xen/remus/vdi.py new file mode 100644 --- /dev/null +++ b/tools/python/xen/remus/vdi.py @@ -0,0 +1,121 @@ +#code to play with vdis and snapshots + +import os + +def run(cmd): + fd = os.popen(cmd) + res = [l for l in fd if l.rstrip()] + return not fd.close(), res + + +_blockstore = ''/blockstore.dat'' + +def set_blockstore(blockstore): + global _blockstore + __blockstore = blockstore + + +class SnapShot: + def __init__(self, vdi, block, index): + self.__vdi = vdi + self.__block = block + self.__index = index + + #TODO add 
snapshot date and radix + + def __str__(self): + return ''%d %d %d'' % (self.__vdi.id(), self.__block, self.__index) + + def vdi(self): + return self.__vdi + + def block(self): + return self.__block + + def index(self): + return self.__index + + def match(self, block, index): + return self.__block == block and self.__index == index + + +class VDIException(Exception): + pass + + +class VDI: + def __init__(self, id, name): + self.__id = id + self.__name = name + + def __str__(self): + return ''vdi: %d %s'' % (self.__id, self.__name) + + def id(self): + return self.__id + + def name(self): + return self.__name + + def list_snapshots(self): + res, ls = run(''vdi_snap_list %s %d'' % (_blockstore, self.__id)) + if res: + return [SnapShot(self, int(l[0]), int(l[1])) for l in [l.split() for l in ls[1:]]] + else: + raise VDIException("Error reading snapshot list") + + def snapshot(self): + res, ls = run(''vdi_checkpoint %s %d'' % (_blockstore, self.__id)) + if res: + _, block, idx = ls[0].split() + return SnapShot(self, int(block), int(idx)) + else: + raise VDIException("Error taking vdi snapshot") + + +def create(name, snap): + res, _ = run(''vdi_create %s %s %d %d'' + % (_blockstore, name, snap.block(), snap.index())) + if res: + return lookup_by_name(name) + else: + raise VDIException(''Unable to create vdi from snapshot'') + + +def fill(name, img_file): + res, _ = run(''vdi_create %s %s'' % (_blockstore, name)) + + if res: + vdi = lookup_by_name(name) + res, _ = run(''vdi_fill %d %s'' % (vdi.id(), img_file)) + if res: + return vdi + raise VDIException(''Unable to create vdi from disk img file'') + + +def list_vdis(): + vdis = [] + res, lines = run(''vdi_list %s'' % _blockstore) + if res: + for l in lines: + r = l.split() + vdis.append(VDI(int(r[0]), r[1])) + return vdis + else: + raise VDIException("Error doing vdi list") + + +def lookup_by_id(id): + vdis = list_vdis() + for v in vdis: + if v.id() == id: + return v + raise VDIException("No match from vdi id") + + +def 
lookup_by_name(name): + vdis = list_vdis() + for v in vdis: + if v.name() == name: + return v + raise VDIException("No match for vdi name") diff --git a/tools/python/xen/remus/vif.py b/tools/python/xen/remus/vif.py new file mode 100644 --- /dev/null +++ b/tools/python/xen/remus/vif.py @@ -0,0 +1,14 @@ +from xen.remus.util import canonifymac + +class VIF(object): + def __init__(self, **props): + self.__dict__.update(props) + if ''mac'' in props: + self.mac = canonifymac(props[''mac'']) + + def __str__(self): + return self.mac + +def parse(props): + "turn a vm device dictionary into a vif object" + return VIF(**props) diff --git a/tools/python/xen/remus/vm.py b/tools/python/xen/remus/vm.py new file mode 100644 --- /dev/null +++ b/tools/python/xen/remus/vm.py @@ -0,0 +1,156 @@ +#!/usr/bin/env python + +import xmlrpclib + +from xen.xend.XendClient import server +from xen.xend import sxp +# XXX XendDomain is voodoo to let balloon import succeed +from xen.xend import XendDomain, balloon + +import vif +import blkdev +# need a nicer way to load disk drivers +import vbd + +class VMException(Exception): pass + +class VM(object): + "Representation of a virtual machine" + def __init__(self, domid=None, dominfo=None): + self.dominfo = dominfo + + self.domid = -1 + self.name = ''unknown'' + self.dom = {} + self.disks = [] + self.vifs = [] + + if domid: + try: + self.dominfo = server.xend.domain(domid, ''all'') + except xmlrpclib.Fault: + raise VMException(''error looking up domain %s'' % str(domid)) + + if self.dominfo: + self.loaddominfo() + + def loaddominfo(self): + self.dom = parsedominfo(self.dominfo) + self.domid = self.dom[''domid''] + self.name = self.dom[''name''] + + self.disks = getdisks(self.dom) + self.vifs = getvifs(self.dom) + + def __str__(self): + return ''VM %d (%s), MACs: [%s], disks: [%s]'' % \ + (self.domid, self.name, self.epoch, '', ''.join(self.macs), + '', ''.join([str(d) for d in self.disks])) + +def parsedominfo(dominfo): + "parses a dominfo 
sexpression in the form of python lists of lists" + def s2d(s): + r = {} + for elem in s: + if len(elem) == 0: + continue + name = elem[0] + if len(elem) == 1: + val = None + else: + val = elem[1] + if isinstance(val, list): + val = s2d(elem[1:]) + if isinstance(name, list): + # hack for [''cpus'', [[1]]] + return s2d(elem) + if name in r: + for k, v in val.iteritems(): + if k in r[name]: + if not isinstance(r[name][k], list): + r[name][k] = [r[name][k]] + r[name][k].append(v) + else: + r[name][k] = v + else: + r[name] = val + return r + + return s2d(dominfo[1:]) + +def domtosxpr(dom): + "convert a dominfo into a python sxpr" + def d2s(d): + r = [] + for k, v in d.iteritems(): + elem = [k] + if isinstance(v, dict): + elem.extend(d2s(v)) + else: + if v is None: + v = '''' + elem.append(v) + r.append(elem) + return r + + sxpr = [''domain''] + sxpr.extend(d2s(dom)) + return sxpr + +def strtosxpr(s): + "convert a string to a python sxpr" + p = sxp.Parser() + p.input(s) + return p.get_val() + +def sxprtostr(sxpr): + "convert an sxpr to string" + return sxp.to_string(sxpr) + +def getvifs(dom): + "return vif objects for devices in dom" + vifs = dom[''device''].get(''vif'', []) + if type(vifs) != list: + vifs = [vifs] + + return [vif.parse(v) for v in vifs] + +def getdisks(dom): + "return block device objects for devices in dom" + disks = dom[''device''].get(''vbd'', []) + if type(disks) != list: + disks = [disks] + + # tapdisk1 devices + tap1s = dom[''device''].get(''tap'', []) + if type(tap1s) != list: + disks.append(tap1s) + else: + disks.extend(tap1s) + + # tapdisk2 devices + tap2s = dom[''device''].get(''tap2'', []) + if type(tap2s) != list: + disks.append(tap2s) + else: + disks.extend(tap2s) + + return [blkdev.parse(disk) for disk in disks] + +def fromxend(domid): + "create a VM object from xend information" + return VM(domid) + +def getshadowmem(vm): + "Balloon down domain0 to create free memory for shadow paging." 
+ maxmem = int(vm.dom[''maxmem'']) + shadow = int(vm.dom[''shadow_memory'']) + vcpus = int(vm.dom[''vcpus'']) + + # from XendDomainInfo.checkLiveMigrateMemory: + # 1MB per vcpu plus 4Kib/Mib of RAM. This is higher than + # the minimum that Xen would allocate if no value were given. + needed = vcpus * 1024 + maxmem * 4 - shadow * 1024 + if needed > 0: + print "Freeing %d kB for shadow mode" % needed + balloon.free(needed, vm.dominfo) _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Brendan Cully
2009-Nov-13 01:10 UTC
[Xen-devel] [PATCH 2 of 3] Remus: add control script to activate remus on a VM
# HG changeset patch # User Brendan Cully <brendan@cs.ubc.ca> # Date 1258074147 28800 # Node ID 4e36da19dc8f433910be8adabd8a3e4e5cead5d6 # Parent 213fb814acf431d2a382e8f9c09b4cea106c0958 Remus: add control script to activate remus on a VM Signed-off-by: Brendan Cully <brendan@cs.ubc.ca> diff --git a/tools/Makefile b/tools/Makefile --- a/tools/Makefile +++ b/tools/Makefile @@ -33,6 +33,7 @@ SUBDIRS-$(CONFIG_IOEMU) += ioemu-dir SUBDIRS-y += xenpmd SUBDIRS-y += libxl +SUBDIRS-y += remus # These don''t cross-compile ifeq ($(XEN_COMPILE_ARCH),$(XEN_TARGET_ARCH)) diff --git a/tools/remus/Makefile b/tools/remus/Makefile new file mode 100644 --- /dev/null +++ b/tools/remus/Makefile @@ -0,0 +1,20 @@ +XEN_ROOT=../.. +include $(XEN_ROOT)/tools/Rules.mk + +SCRIPTS = remus + +.PHONY: all +all: build + +.PHONY: build +build: + echo "Nothing to do" + +.PHONY: install +install: + $(INSTALL_DIR) $(DESTDIR)$(BINDIR) + $(INSTALL_PYTHON_PROG) $(SCRIPTS) $(DESTDIR)$(BINDIR) + +.PHONY: clean +clean: + echo "Nothing to do" diff --git a/tools/remus/README b/tools/remus/README new file mode 100644 --- /dev/null +++ b/tools/remus/README @@ -0,0 +1,4 @@ +Remus provides fault tolerance for virtual machines by sending continuous +checkpoints to a backup, which will activate if the target VM fails. + +See the website at http://nss.cs.ubc.ca/remus/ for details. diff --git a/tools/remus/remus b/tools/remus/remus new file mode 100755 --- /dev/null +++ b/tools/remus/remus @@ -0,0 +1,362 @@ +#!/usr/bin/env python +# +# This is a save process which also buffers outgoing I/O between +# rounds, so that external viewers never see anything that hasn''t +# been committed at the backup +# +# TODO: fencing. 
+ +import optparse, os, re, select, signal, sys, time +from xen.remus import save, vm +from xen.xend import XendOptions +from xen.remus import netlink, qdisc, util + +class CfgException(Exception): pass + +class Cfg(object): + def __init__(self): + # must be set + self.domid = 0 + + self.host = ''localhost'' + self.port = XendOptions.instance().get_xend_relocation_port() + self.interval = 200 + self.netbuffer = True + self.nobackup = False + self.timer = False + + parser = optparse.OptionParser() + parser.usage = ''%prog [options] domain [destination]'' + parser.add_option(''-i'', ''--interval'', dest=''interval'', type=''int'', + metavar=''MS'', + help=''checkpoint every MS milliseconds'') + parser.add_option(''-p'', ''--port'', dest=''port'', type=''int'', + help=''send stream to port PORT'', metavar=''PORT'') + parser.add_option('''', ''--no-net'', dest=''nonet'', action=''store_true'', + help=''run without net buffering (benchmark option)'') + parser.add_option('''', ''--timer'', dest=''timer'', action=''store_true'', + help=''force pause at checkpoint interval (experimental)'') + parser.add_option('''', ''--no-backup'', dest=''nobackup'', + action=''store_true'', + help=''prevent backup from starting up (benchmark '' + ''option)'') + self.parser = parser + + def usage(self): + self.parser.print_help() + + def getargs(self): + opts, args = self.parser.parse_args() + + if opts.interval: + self.interval = opts.interval + if opts.port: + self.port = opts.port + if opts.nonet: + self.netbuffer = False + if opts.timer: + self.timer = True + + if not args: + raise CfgException(''Missing domain'') + self.domid = args[0] + if (len(args) > 1): + self.host = args[1] + +class ReplicatedDiskException(Exception): pass + +class BufferedDevice(object): + ''Base class for buffered devices'' + + def postsuspend(self): + ''called after guest has suspended'' + pass + + def preresume(self): + ''called before guest resumes'' + pass + + def commit(self): + ''called when backup has 
acknowledged checkpoint reception'' + pass + +class ReplicatedDisk(BufferedDevice): + """ + Send a checkpoint message to a replicated disk while the domain + is paused between epochs. + """ + FIFODIR = ''/var/run/tap'' + + def __init__(self, disk): + # look up disk, make sure it is tap:buffer, and set up socket + # to request commits. + self.ctlfd = None + + if not disk.uname.startswith(''tap:remus:'') and not disk.uname.startswith(''tap:tapdisk:remus:''): + raise ReplicatedDiskException(''Disk is not replicated: %s'' % + str(disk)) + fifo = re.match("tap:.*(remus.*)\|", disk.uname).group(1).replace('':'', ''_'') + absfifo = os.path.join(self.FIFODIR, fifo) + absmsgfifo = absfifo + ''.msg'' + + self.installed = False + self.ctlfd = open(absfifo, ''w+b'') + self.msgfd = open(absmsgfifo, ''r+b'') + + def __del__(self): + self.uninstall() + + def setup(self): + #self.ctlfd.write(''buffer'') + #self.ctlfd.flush() + self.installed = True + + def uninstall(self): + if self.ctlfd: + self.ctlfd.close() + self.ctlfd = None + + def postsuspend(self): + if not self.installed: + self.setup() + + os.write(self.ctlfd.fileno(), ''flush'') + + def commit(self): + msg = os.read(self.msgfd.fileno(), 4) + if msg != ''done'': + print ''Unknown message: %s'' % msg + +class NetbufferException(Exception): pass + +class Netbuffer(BufferedDevice): + """ + Buffer a protected domain''s network output between rounds so that + nothing is issued that a failover might not know about. 
+ """ + # shared rtnetlink handle + rth = None + + def __init__(self, domid): + self.installed = False + + if not self.rth: + self.rth = netlink.rtnl() + + self.devname = self._startimq(domid) + dev = self.rth.getlink(self.devname) + if not dev: + raise NetbufferException(''could not find device %s'' % self.devname) + self.dev = dev[''index''] + self.handle = qdisc.TC_H_ROOT + self.q = qdisc.QueueQdisc() + + def __del__(self): + self.uninstall() + + def postsuspend(self): + if not self.installed: + self._setup() + + self._sendqmsg(qdisc.TC_QUEUE_CHECKPOINT) + + def commit(self): + ''''''Called when checkpoint has been acknowledged by + the backup'''''' + self._sendqmsg(qdisc.TC_QUEUE_RELEASE) + + def _sendqmsg(self, action): + self.q.action = action + req = qdisc.changerequest(self.dev, self.handle, self.q) + self.rth.talk(req.pack()) + + def _setup(self): + q = self.rth.getqdisc(self.dev) + if q: + if q[''kind''] == ''queue'': + self.installed = True + return + if q[''kind''] != ''pfifo_fast'': + raise NetbufferException(''there is already a queueing '' + ''discipline on %s'' % self.devname) + + print ''installing buffer on %s'' % self.devname + req = qdisc.addrequest(self.dev, self.handle, self.q) + self.rth.talk(req.pack()) + self.installed = True + + def uninstall(self): + if self.installed: + req = qdisc.delrequest(self.dev, self.handle) + self.rth.talk(req.pack()) + self.installed = False + + def _startimq(self, domid): + # stopgap hack to set up IMQ for an interface. Wrong in many ways. 
+ imqebt = ''/usr/lib/xen/bin/imqebt'' + imqdev = ''imq0'' + vid = ''vif%d.0'' % domid + for mod in [''sch_queue'', ''imq'', ''ebt_imq'']: + util.runcmd([''modprobe'', mod]) + util.runcmd("ip link set %s up" % (imqdev)) + util.runcmd("ebtables -F FORWARD") + util.runcmd("ebtables -A FORWARD -i %s -j imq --todev %s" % (vid, imqdev)) + + return imqdev + +class SignalException(Exception): pass + +def run(cfg): + closure = lambda: None + closure.cmd = None + + def sigexception(signo, frame): + raise SignalException(signo) + + def die(): + # I am not sure what the best way to die is. xm destroy is another option, + # or we could attempt to trigger some instant reboot. + print "dying..." + print util.runcmd([''sudo'', ''ifdown'', ''eth2'']) + # dangling imq0 handle on vif locks up the system + for buf in bufs: + buf.uninstall() + print util.runcmd([''sudo'', ''xm'', ''destroy'', cfg.domid]) + print util.runcmd([''sudo'', ''ifup'', ''eth2'']) + + def getcommand(): + """Get a command to execute while running. 
+ Commands include: + s: die prior to postsuspend hook + s2: die after postsuspend hook + r: die prior to preresume hook + r2: die after preresume hook + c: die prior to commit hook + c2: die after commit hook + """ + r, w, x = select.select([sys.stdin], [], [], 0) + if sys.stdin not in r: + return + + cmd = sys.stdin.readline().strip() + if cmd not in (''s'', ''s2'', ''r'', ''r2'', ''c'', ''c2''): + print "unknown command: %s" % cmd + closure.cmd = cmd + + signal.signal(signal.SIGTERM, sigexception) + + dom = vm.VM(cfg.domid) + + # set up I/O buffers + bufs = [] + + # disks must commit before network can be released + for disk in dom.disks: + try: + bufs.append(ReplicatedDisk(disk)) + except ReplicatedDiskException, e: + print e + continue + + if cfg.netbuffer: + for vif in dom.vifs: + bufs.append(Netbuffer(dom.domid)) + + fd = save.MigrationSocket((cfg.host, cfg.port)) + + def postsuspend(): + ''Begin external checkpointing after domain has paused'' + if not cfg.timer: + # when not using a timer thread, sleep until now + interval + closure.starttime = time.time() + + if closure.cmd == ''s'': + die() + + for buf in bufs: + buf.postsuspend() + + if closure.cmd == ''s2'': + die() + + def preresume(): + ''Complete external checkpointing before domain resumes'' + if closure.cmd == ''r'': + die() + + for buf in bufs: + buf.preresume() + + if closure.cmd == ''r2'': + die() + + def commit(): + ''commit network buffer'' + if closure.cmd == ''c'': + die() + + print >> sys.stderr, "PROF: flushed memory at %0.6f" % (time.time()) + + for buf in bufs: + buf.commit() + + if closure.cmd == ''c2'': + die() + + # Since the domain is running at this point, it''s a good time to + # check for control channel commands + getcommand() + + if not cfg.timer: + endtime = time.time() + elapsed = (endtime - closure.starttime) * 1000 + + if elapsed < cfg.interval: + time.sleep((cfg.interval - elapsed) / 1000.0) + + # False ends checkpointing + return True + + if cfg.timer: + interval = 
cfg.interval + else: + interval = 0 + + rc = 0 + + checkpointer = save.Saver(cfg.domid, fd, postsuspend, preresume, commit, + interval) + + try: + checkpointer.start() + except save.CheckpointError, e: + print e + rc = 1 + except KeyboardInterrupt: + pass + except SignalException: + print ''*** signalled ***'' + + for buf in bufs: + buf.uninstall() + + if cfg.nobackup: + # lame attempt to kill backup if protection is stopped deliberately. + # It would be much better to move this into the heartbeat "protocol". + print util.runcmd([''sudo'', ''-u'', os.getlogin(), ''ssh'', cfg.host, ''sudo'', ''xm'', ''destroy'', dom.name]) + + sys.exit(rc) + +cfg = Cfg() +try: + cfg.getargs() +except CfgException, inst: + print str(inst) + cfg.usage() + sys.exit(1) + +try: + run(cfg) +except vm.VMException, inst: + print str(inst) + sys.exit(1) _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Brendan Cully
2009-Nov-13 01:10 UTC
[Xen-devel] [PATCH 3 of 3] Remus: support for network buffering
# HG changeset patch # User Brendan Cully <brendan@cs.ubc.ca> # Date 1258074170 28800 # Node ID 6caed9eff54375d6fb561dab0ea1213e37e98339 # Parent 4e36da19dc8f433910be8adabd8a3e4e5cead5d6 Remus: support for network buffering This currently relies on the third-party IMQ patch (linuximq.net) being present in dom0. The plan is to replace this with a direct hook into netback eventually. This patch includes a pared-down and patched copy of ebtables to install IMQ on a VIF. Signed-off-by: Brendan Cully <brendan@cs.ubc.ca> diff --git a/.hgignore b/.hgignore --- a/.hgignore +++ b/.hgignore @@ -205,6 +205,8 @@ ^tools/pygrub/build/.*$ ^tools/python/build/.*$ ^tools/python/xen/util/path\.py$ +^tools/remus/imqebt/imqebt$ +^tools/remus/kmod/.*(\.cmd|\.mod|\.ko|\.mod\.c|\.symvers|\.xen)$ ^tools/security/secpol_tool$ ^tools/security/xen/.*$ ^tools/security/xensec_tool$ diff --git a/tools/remus/Makefile b/tools/remus/Makefile --- a/tools/remus/Makefile +++ b/tools/remus/Makefile @@ -1,20 +1,17 @@ XEN_ROOT=../.. include $(XEN_ROOT)/tools/Rules.mk +SUBDIRS-y := imqebt kmod + SCRIPTS = remus .PHONY: all -all: build - -.PHONY: build -build: - echo "Nothing to do" +all: subdirs-all .PHONY: install -install: +install: subdirs-install $(INSTALL_DIR) $(DESTDIR)$(BINDIR) $(INSTALL_PYTHON_PROG) $(SCRIPTS) $(DESTDIR)$(BINDIR) .PHONY: clean -clean: - echo "Nothing to do" +clean: subdirs-clean diff --git a/tools/remus/imqebt/Makefile b/tools/remus/imqebt/Makefile new file mode 100644 --- /dev/null +++ b/tools/remus/imqebt/Makefile @@ -0,0 +1,97 @@ +# ebtables Makefile (reworked for Remus IMQ control) + +XEN_ROOT=../../.. 
+include $(XEN_ROOT)/tools/Rules.mk + +PROGNAME:=ebtables +PROGRELEASE:=1 +PROGVERSION_:=2.0.9 +PROGVERSION:=$(PROGVERSION_)-$(PROGRELEASE) +PROGDATE:=June\ 2009 + +ifeq ($(shell uname -m),sparc64) +CFLAGS+=-DEBT_MIN_ALIGN=8 -DKERNEL_64_USERSPACE_32 +endif + +include extensions/Makefile + +OBJECTS2:=getethertype.o communication.o libebtc.o \ +useful_functions.o ebtables.o + +OBJECTS:=$(OBJECTS2) $(EXT_OBJS) $(EXT_LIBS) + +KERNEL_INCLUDES?=include/ + +ETHERTYPESPATH?=$(ETCDIR) +ETHERTYPESFILE:=$(ETHERTYPESPATH)/ethertypes + +PIPE_DIR?=/tmp/$(PROGNAME)-v$(PROGVERSION) +PIPE=$(PIPE_DIR)/ebtablesd_pipe +EBTD_CMDLINE_MAXLN?=2048 +EBTD_ARGC_MAX?=50 + +PROGSPECS:=-DPROGVERSION=\"$(PROGVERSION)\" \ + -DPROGNAME=\"$(PROGNAME)\" \ + -DPROGDATE=\"$(PROGDATE)\" \ + -D_PATH_ETHERTYPES=\"$(ETHERTYPESFILE)\" \ + -DEBTD_ARGC_MAX=$(EBTD_ARGC_MAX) \ + -DEBTD_CMDLINE_MAXLN=$(EBTD_CMDLINE_MAXLN) + +# Uncomment for debugging (slower) +#PROGSPECS+=-DEBT_DEBUG +#CFLAGS+=-ggdb + +PROGRAMS = imqebt + +.PHONY: all +all: build + +.PHONY: build +build: $(PROGRAMS) + +# a little scripting for a static binary, making one for ebtables-restore +# should be completely analogous +imqebt: extensions/ebt_*.c extensions/ebtable_*.c ebtables.c communication.c ebtables-standalone.c getethertype.c libebtc.c useful_functions.c + cp ebtables-standalone.c ebtables-standalone.c_ ; \ + cp include/ebtables_u.h include/ebtables_u.h_ ; \ + sed "s/ main(/ pseudomain(/" ebtables-standalone.c > ebtables-standalone.c__ ; \ + mv ebtables-standalone.c__ ebtables-standalone.c ; \ + printf "\nint main(int argc, char *argv[])\n{\n " >> ebtables-standalone.c ; \ + for arg in $(EXT_FUNC) \ + ; do \ + sed s/_init/_$${arg}_init/ extensions/ebt_$${arg}.c > extensions/ebt_$${arg}.c_ ; \ + mv extensions/ebt_$${arg}.c_ extensions/ebt_$${arg}.c ; \ + printf "\t%s();\n" _$${arg}_init >> ebtables-standalone.c ; \ + printf "extern void %s(void);\n" _$${arg}_init >> include/ebtables_u.h ; \ + done ; \ + for arg in $(EXT_TABLES) \ + ; 
do \ + sed s/_init/_t_$${arg}_init/ extensions/ebtable_$${arg}.c > extensions/ebtable_$${arg}.c_ ; \ + mv extensions/ebtable_$${arg}.c_ extensions/ebtable_$${arg}.c ; \ + printf "\t%s();\n" _t_$${arg}_init >> ebtables-standalone.c ; \ + printf "extern void %s(void);\n" _t_$${arg}_init >> include/ebtables_u.h ; \ + done ; \ + printf "\n\tpseudomain(argc, argv);\n\treturn 0;\n}\n" >> ebtables-standalone.c ;\ + $(CC) $(CFLAGS) $(PROGSPECS) -o $@ $^ -I$(KERNEL_INCLUDES) -Iinclude ; \ + for arg in $(EXT_FUNC) \ + ; do \ + sed "s/ .*_init/ _init/" extensions/ebt_$${arg}.c > extensions/ebt_$${arg}.c_ ; \ + mv extensions/ebt_$${arg}.c_ extensions/ebt_$${arg}.c ; \ + done ; \ + for arg in $(EXT_TABLES) \ + ; do \ + sed "s/ .*_init/ _init/" extensions/ebtable_$${arg}.c > extensions/ebtable_$${arg}.c_ ; \ + mv extensions/ebtable_$${arg}.c_ extensions/ebtable_$${arg}.c ; \ + done ; \ + mv ebtables-standalone.c_ ebtables-standalone.c ; \ + mv include/ebtables_u.h_ include/ebtables_u.h + +.PHONY: install +install: build + $(INSTALL_DIR) $(DESTDIR)$(PRIVATE_BINDIR) + $(INSTALL_PROG) $(PROGRAMS) $(DESTDIR)$(PRIVATE_BINDIR) + +.PHONY: clean +clean: + rm -f imqebt + rm -f *.o *~ *.so diff --git a/tools/remus/imqebt/README b/tools/remus/imqebt/README new file mode 100644 --- /dev/null +++ b/tools/remus/imqebt/README @@ -0,0 +1,2 @@ +This is a fork of ebtables for installing IMQ on a bridged device. +Like the original code, it is released under the GPL. \ No newline at end of file diff --git a/tools/remus/imqebt/communication.c b/tools/remus/imqebt/communication.c new file mode 100644 --- /dev/null +++ b/tools/remus/imqebt/communication.c @@ -0,0 +1,762 @@ +/* + * communication.c, v2.0 July 2002 + * + * Author: Bart De Schuymer + * + */ + +/* + * All the userspace/kernel communication is in this file. + * The other code should not have to know anything about the way the + * kernel likes the structure of the table data. + * The other code works with linked lists. 
So, the translation is done here.
+ */
+
+#include <getopt.h>
+#include <string.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/socket.h>
+#include "include/ebtables_u.h"
+
+extern char* hooknames[NF_BR_NUMHOOKS];
+
+#ifdef KERNEL_64_USERSPACE_32
+#define sparc_cast (uint64_t)
+#else
+#define sparc_cast
+#endif
+
+/* Raw socket shared by every kernel call in this file; -1 until opened */
+int sockfd = -1;
+
+/*
+ * Open the shared raw socket on first use.
+ * Returns 0 when sockfd is usable, -1 (after reporting the error) when
+ * socket() failed. A failed attempt leaves sockfd negative.
+ */
+static int get_sockfd(void)
+{
+	int ret = 0;
+	if (sockfd == -1) {
+		sockfd = socket(AF_INET, SOCK_RAW, PF_INET);
+		if (sockfd < 0) {
+			ebt_print_error("Problem getting a socket, "
+					"you probably don't have the right "
+					"permissions");
+			ret = -1;
+		}
+	}
+	return ret;
+}
+
+/*
+ * Flatten the linked-list table in u_repl into a freshly allocated
+ * struct ebt_replace whose entries member points to one contiguous block.
+ * First pass: compute the block size and the byte offset of every chain.
+ * Second pass: copy chain headers, entries, matches, watchers and targets.
+ * The caller owns the result and frees both new->entries and new.
+ */
+static struct ebt_replace *translate_user2kernel(struct ebt_u_replace *u_repl)
+{
+	struct ebt_replace *new;
+	struct ebt_u_entry *e;
+	struct ebt_u_match_list *m_l;
+	struct ebt_u_watcher_list *w_l;
+	struct ebt_u_entries *entries;
+	char *p, *base;
+	int i, j;
+	unsigned int entries_size = 0, *chain_offsets;
+
+	new = (struct ebt_replace *)malloc(sizeof(struct ebt_replace));
+	if (!new)
+		ebt_print_memory();
+	new->valid_hooks = u_repl->valid_hooks;
+	strcpy(new->name, u_repl->name);
+	new->nentries = u_repl->nentries;
+	new->num_counters = u_repl->num_counters;
+	new->counters = sparc_cast u_repl->counters;
+	chain_offsets = (unsigned int *)malloc(u_repl->num_chains * sizeof(unsigned int));
+	/* Same out-of-memory handling as every other allocation in this file */
+	if (!chain_offsets)
+		ebt_print_memory();
+	/* Determine size */
+	for (i = 0; i < u_repl->num_chains; i++) {
+		if (!(entries = u_repl->chains[i]))
+			continue;
+		chain_offsets[i] = entries_size;
+		entries_size += sizeof(struct ebt_entries);
+		j = 0;
+		e = entries->entries->next;
+		while (e != entries->entries) {
+			j++;
+			entries_size += sizeof(struct ebt_entry);
+			m_l = e->m_list;
+			while (m_l) {
+				entries_size += m_l->m->match_size +
+				   sizeof(struct ebt_entry_match);
+				m_l = m_l->next;
+			}
+			w_l = e->w_list;
+			while (w_l) {
+				entries_size += w_l->w->watcher_size +
+				   sizeof(struct ebt_entry_watcher);
+				w_l = w_l->next;
+			}
+			entries_size += e->t->target_size +
+			   sizeof(struct ebt_entry_target);
+			e = e->next;
+		}
+		/* A little sanity check */
+		if (j != entries->nentries)
+			ebt_print_bug("Wrong nentries: %d != %d, hook = %s", j,
+			   entries->nentries, entries->name);
+	}
+
+	new->entries_size = entries_size;
+	p = (char *)malloc(entries_size);
+	if (!p)
+		ebt_print_memory();
+
+	/* Put everything in one block */
+	new->entries = sparc_cast p;
+	for (i = 0; i < u_repl->num_chains; i++) {
+		struct ebt_entries *hlp;
+
+		hlp = (struct ebt_entries *)p;
+		if (!(entries = u_repl->chains[i]))
+			continue;
+		if (i < NF_BR_NUMHOOKS)
+			new->hook_entry[i] = sparc_cast hlp;
+		hlp->nentries = entries->nentries;
+		hlp->policy = entries->policy;
+		strcpy(hlp->name, entries->name);
+		hlp->counter_offset = entries->counter_offset;
+		hlp->distinguisher = 0; /* Make the kernel see the light */
+		p += sizeof(struct ebt_entries);
+		e = entries->entries->next;
+		while (e != entries->entries) {
+			struct ebt_entry *tmp = (struct ebt_entry *)p;
+
+			tmp->bitmask = e->bitmask | EBT_ENTRY_OR_ENTRIES;
+			tmp->invflags = e->invflags;
+			tmp->ethproto = e->ethproto;
+			strcpy(tmp->in, e->in);
+			strcpy(tmp->out, e->out);
+			strcpy(tmp->logical_in, e->logical_in);
+			strcpy(tmp->logical_out, e->logical_out);
+			memcpy(tmp->sourcemac, e->sourcemac,
+			   sizeof(tmp->sourcemac));
+			memcpy(tmp->sourcemsk, e->sourcemsk,
+			   sizeof(tmp->sourcemsk));
+			memcpy(tmp->destmac, e->destmac, sizeof(tmp->destmac));
+			memcpy(tmp->destmsk, e->destmsk, sizeof(tmp->destmsk));
+
+			/* All offsets stored in the entry are relative to its start */
+			base = p;
+			p += sizeof(struct ebt_entry);
+			m_l = e->m_list;
+			while (m_l) {
+				memcpy(p, m_l->m, m_l->m->match_size +
+				   sizeof(struct ebt_entry_match));
+				p += m_l->m->match_size +
+				   sizeof(struct ebt_entry_match);
+				m_l = m_l->next;
+			}
+			tmp->watchers_offset = p - base;
+			w_l = e->w_list;
+			while (w_l) {
+				memcpy(p, w_l->w, w_l->w->watcher_size +
+				   sizeof(struct ebt_entry_watcher));
+				p += w_l->w->watcher_size +
+				   sizeof(struct ebt_entry_watcher);
+				w_l = w_l->next;
+			}
+			tmp->target_offset = p - base;
+			memcpy(p, e->t, e->t->target_size +
+			   sizeof(struct ebt_entry_target));
+			if (!strcmp(e->t->u.name, EBT_STANDARD_TARGET)) {
+				struct ebt_standard_target *st =
+				   (struct ebt_standard_target *)p;
+				/* Translate the jump to a udc: a non-negative
+				 * verdict is a udc index, the flat block wants
+				 * that chain's byte offset instead */
+				if (st->verdict >= 0)
+					st->verdict = chain_offsets
+					   [st->verdict + NF_BR_NUMHOOKS];
+			}
+			p += e->t->target_size +
+			   sizeof(struct ebt_entry_target);
+			tmp->next_offset = p - base;
+			e = e->next;
+		}
+	}
+
+	/* Sanity check */
+	if (p - (char *)new->entries != new->entries_size)
+		ebt_print_bug("Entries_size bug");
+	free(chain_offsets);
+	return new;
+}
+
+/*
+ * Write repl to filename as: struct ebt_replace, the entries block, then
+ * one zeroed struct ebt_counter per entry. Errors are reported through
+ * ebt_print_error(); nothing is returned to the caller.
+ */
+static void store_table_in_file(char *filename, struct ebt_replace *repl)
+{
+	char *data;
+	int size;
+	int fd;
+
+	/* Start from an empty file with right privileges.
+	 * creat() reports failure with -1; 0 is a valid descriptor. */
+	if ((fd = creat(filename, 0600)) < 0) {
+		ebt_print_error("Couldn't create file %s", filename);
+		return;
+	}
+
+	size = sizeof(struct ebt_replace) + repl->entries_size +
+	   repl->nentries * sizeof(struct ebt_counter);
+	data = (char *)malloc(size);
+	if (!data)
+		ebt_print_memory();
+	memcpy(data, repl, sizeof(struct ebt_replace));
+	memcpy(data + sizeof(struct ebt_replace), (char *)repl->entries,
+	   repl->entries_size);
+	/* Initialize counters to zero, deliver_counters() can update them */
+	memset(data + sizeof(struct ebt_replace) + repl->entries_size,
+	   0, repl->nentries * sizeof(struct ebt_counter));
+	if (write(fd, data, size) != size)
+		ebt_print_error("Couldn't write everything to file %s",
+				filename);
+	close(fd);
+	free(data);
+}
+
+/*
+ * Translate u_repl to the kernel layout and deliver it: to
+ * u_repl->filename when one is set, otherwise to the kernel through
+ * EBT_SO_SET_ENTRIES. For command 8 a failed first attempt is retried
+ * once after ebtables_insmod("ebtables"). The translated copy is always
+ * freed before returning.
+ */
+void ebt_deliver_table(struct ebt_u_replace *u_repl)
+{
+	socklen_t optlen;
+	struct ebt_replace *repl;
+
+	/* Translate the struct ebt_u_replace to a struct ebt_replace */
+	repl = translate_user2kernel(u_repl);
+	if (u_repl->filename != NULL) {
+		store_table_in_file(u_repl->filename, repl);
+		goto free_repl;
+	}
+	/* Give the data to the kernel */
+	optlen = sizeof(struct ebt_replace) + repl->entries_size;
+	if (get_sockfd())
+		goto free_repl;
+	if (!setsockopt(sockfd, IPPROTO_IP, EBT_SO_SET_ENTRIES, repl, optlen))
+		goto free_repl;
+	if (u_repl->command == 8) { /* The ebtables module may not
+				     * yet be loaded with --atomic-commit */
+		ebtables_insmod("ebtables");
+		if (!setsockopt(sockfd, IPPROTO_IP, EBT_SO_SET_ENTRIES,
+		    repl, optlen))
+			goto free_repl;
+	}
+
+	ebt_print_error("The kernel doesn't support a certain ebtables"
+		    " extension, consider recompiling your kernel or insmod"
+		    " the extension");
+free_repl:
+	if (repl) {
+		free(repl->entries);
+		free(repl);
+	}
+}
+
+/*
+ * Overwrite the counter area of an existing table file with
+ * repl->counters. The counter area starts right after the struct
+ * ebt_replace header and the entries block, whose size is read back from
+ * the file itself.
+ * Returns 0 on success, -1 when the file cannot be opened, is corrupt or
+ * cannot be written completely.
+ */
+static int store_counters_in_file(char *filename, struct ebt_u_replace *repl)
+{
+	int size = repl->nentries * sizeof(struct ebt_counter), ret = 0;
+	unsigned int entries_size;
+	struct ebt_replace hlp;
+	FILE *file;
+
+	if (!(file = fopen(filename, "r+b"))) {
+		ebt_print_error("Could not open file %s", filename);
+		return -1;
+	}
+	/* Find out entries_size and then set the file pointer to the
+	 * counters. hlp is only used to compute the field's offset. */
+	if (fseek(file, (char *)(&hlp.entries_size) - (char *)(&hlp), SEEK_SET)
+	   || fread(&entries_size, sizeof(char), sizeof(unsigned int), file) !=
+	   sizeof(unsigned int) ||
+	   fseek(file, entries_size + sizeof(struct ebt_replace), SEEK_SET)) {
+		ebt_print_error("File %s is corrupt", filename);
+		ret = -1;
+		goto close_file;
+	}
+	if (fwrite(repl->counters, sizeof(char), size, file) != size) {
+		ebt_print_error("Could not write everything to file %s",
+				filename);
+		ret = -1;
+	}
+close_file:
+	fclose(file);
+	/* Propagate the failure recorded above instead of a constant 0 */
+	return ret;
+}
+
+/* Gets executed after ebt_deliver_table.
Delivers the counters to the kernel
+ * and resets the counterchanges to CNT_NORM */
+void ebt_deliver_counters(struct ebt_u_replace *u_repl)
+{
+	struct ebt_counter *old, *new, *newcounters;
+	socklen_t optlen;
+	struct ebt_replace repl;
+	struct ebt_cntchanges *cc = u_repl->cc->next, *cc2;
+	struct ebt_u_entries *entries = NULL;
+	struct ebt_u_entry *next = NULL;
+	int i, chainnr = 0;
+
+	if (u_repl->nentries == 0)
+		return;
+
+	newcounters = (struct ebt_counter *)
+	   malloc(u_repl->nentries * sizeof(struct ebt_counter));
+	if (!newcounters)
+		ebt_print_memory();
+	memset(newcounters, 0, u_repl->nentries * sizeof(struct ebt_counter));
+	/* old walks the counters as last read, new walks the array being built */
+	old = u_repl->counters;
+	new = newcounters;
+	while (cc != u_repl->cc) {
+		/* Move on to the first rule of the next non-empty chain */
+		if (!next || next == entries->entries) {
+			while (chainnr < u_repl->num_chains && (!(entries = u_repl->chains[chainnr++]) ||
+			       (next = entries->entries->next) == entries->entries));
+			if (chainnr == u_repl->num_chains)
+				break;
+		}
+		if (cc->type == CNT_NORM) {
+			/* 'Normal' rule, meaning we didn't do anything to it
+			 * So, we just copy */
+			*new = *old;
+			next->cnt = *new;
+			next->cnt_surplus.pcnt = next->cnt_surplus.bcnt = 0;
+			old++; /* We've used an old counter */
+			new++; /* We've set a new counter */
+			next = next->next;
+		} else if (cc->type == CNT_DEL) {
+			old++; /* Don't use this old counter */
+		} else {
+			if (cc->type == CNT_CHANGE) {
+				/* change % 3 selects the packet-count update,
+				 * change / 3 the byte-count update:
+				 * 1 = add surplus, 2 = subtract surplus,
+				 * anything else = take the rule's own value */
+				if (cc->change % 3 == 1)
+					new->pcnt = old->pcnt + next->cnt_surplus.pcnt;
+				else if (cc->change % 3 == 2)
+					new->pcnt = old->pcnt - next->cnt_surplus.pcnt;
+				else
+					new->pcnt = next->cnt.pcnt;
+				if (cc->change / 3 == 1)
+					new->bcnt = old->bcnt + next->cnt_surplus.bcnt;
+				else if (cc->change / 3 == 2)
+					new->bcnt = old->bcnt - next->cnt_surplus.bcnt;
+				else
+					new->bcnt = next->cnt.bcnt;
+			} else
+				*new = next->cnt;
+			next->cnt = *new;
+			next->cnt_surplus.pcnt = next->cnt_surplus.bcnt = 0;
+			if (cc->type == CNT_ADD)
+				new++;
+			else {
+				old++;
+				new++;
+			}
+			next = next->next;
+		}
+		cc = cc->next;
+	}
+
+	free(u_repl->counters);
+	u_repl->counters = newcounters;
+	u_repl->num_counters = u_repl->nentries;
+	/* Reset the counterchanges to CNT_NORM and delete the unused cc */
+	i = 0;
+	cc = u_repl->cc->next;
+	while (cc != u_repl->cc) {
+		if (cc->type == CNT_DEL) {
+			cc->prev->next = cc->next;
+			cc->next->prev = cc->prev;
+			cc2 = cc->next;
+			free(cc);
+			cc = cc2;
+		} else {
+			cc->type = CNT_NORM;
+			cc->change = 0;
+			i++;
+			cc = cc->next;
+		}
+	}
+	if (i != u_repl->nentries)
+		ebt_print_bug("i != u_repl->nentries");
+	if (u_repl->filename != NULL) {
+		store_counters_in_file(u_repl->filename, u_repl);
+		return;
+	}
+	optlen = u_repl->nentries * sizeof(struct ebt_counter) +
+	   sizeof(struct ebt_replace);
+	/* Now put the stuff in the kernel's struct ebt_replace */
+	repl.counters = sparc_cast u_repl->counters;
+	repl.num_counters = u_repl->num_counters;
+	memcpy(repl.name, u_repl->name, sizeof(repl.name));
+
+	if (get_sockfd())
+		return;
+	if (setsockopt(sockfd, IPPROTO_IP, EBT_SO_SET_COUNTERS, &repl, optlen))
+		ebt_print_bug("Couldn't update kernel counters");
+}
+
+/*
+ * EBT_MATCH_ITERATE callback: copy kernel match m into a new list node,
+ * append it at **l and advance *l to the node's next pointer.
+ * Returns -1 when userspace has no match extension of that name, else 0.
+ */
+static int
+ebt_translate_match(struct ebt_entry_match *m, struct ebt_u_match_list ***l)
+{
+	struct ebt_u_match_list *new;
+	int ret = 0;
+
+	new = (struct ebt_u_match_list *)
+	   malloc(sizeof(struct ebt_u_match_list));
+	if (!new)
+		ebt_print_memory();
+	new->m = (struct ebt_entry_match *)
+	   malloc(m->match_size + sizeof(struct ebt_entry_match));
+	if (!new->m)
+		ebt_print_memory();
+	memcpy(new->m, m, m->match_size + sizeof(struct ebt_entry_match));
+	new->next = NULL;
+	**l = new;
+	*l = &new->next;
+	if (ebt_find_match(new->m->u.name) == NULL) {
+		ebt_print_error("Kernel match %s unsupported by userspace tool",
+				new->m->u.name);
+		ret = -1;
+	}
+	return ret;
+}
+
+/*
+ * EBT_WATCHER_ITERATE callback: same as ebt_translate_match() but for
+ * watchers.
+ */
+static int
+ebt_translate_watcher(struct ebt_entry_watcher *w,
+   struct ebt_u_watcher_list ***l)
+{
+	struct ebt_u_watcher_list *new;
+	int ret = 0;
+
+	new = (struct ebt_u_watcher_list *)
+	   malloc(sizeof(struct ebt_u_watcher_list));
+	if (!new)
+		ebt_print_memory();
+	new->w = (struct ebt_entry_watcher *)
+	   malloc(w->watcher_size + sizeof(struct ebt_entry_watcher));
+	if (!new->w)
+		ebt_print_memory();
+	memcpy(new->w, w, w->watcher_size + sizeof(struct ebt_entry_watcher));
+	new->next = NULL;
+	**l = new;
+	*l = &new->next;
+	if (ebt_find_watcher(new->w->u.name) == NULL) {
+		ebt_print_error("Kernel watcher %s unsupported by userspace "
+				"tool", new->w->u.name);
+		ret = -1;
+	}
+	return ret;
+}
+
+/*
+ * EBT_ENTRY_ITERATE callback for the second pass of ebt_get_table().
+ * A rule (EBT_ENTRY_OR_ENTRIES set) is copied into a new ebt_u_entry
+ * linked after *u_e; a chain header switches *u_e to that chain's list.
+ * *n is the expected and *cnt the seen number of rules in the current
+ * chain, *totalcnt the number of rules seen overall, base the start of
+ * the kernel entries block (used to resolve jumps to udc's).
+ * Returns -1 when the rule's target is unknown to userspace, else 0.
+ */
+static int
+ebt_translate_entry(struct ebt_entry *e, int *hook, int *n, int *cnt,
+   int *totalcnt, struct ebt_u_entry **u_e, struct ebt_u_replace *u_repl,
+   unsigned int valid_hooks, char *base, struct ebt_cntchanges **cc)
+{
+	/* An entry */
+	if (e->bitmask & EBT_ENTRY_OR_ENTRIES) {
+		struct ebt_u_entry *new;
+		struct ebt_u_match_list **m_l;
+		struct ebt_u_watcher_list **w_l;
+		struct ebt_entry_target *t;
+
+		new = (struct ebt_u_entry *)malloc(sizeof(struct ebt_u_entry));
+		if (!new)
+			ebt_print_memory();
+		new->bitmask = e->bitmask;
+		/*
+		 * Plain userspace code doesn't know about
+		 * EBT_ENTRY_OR_ENTRIES
+		 */
+		new->bitmask &= ~EBT_ENTRY_OR_ENTRIES;
+		new->invflags = e->invflags;
+		new->ethproto = e->ethproto;
+		strcpy(new->in, e->in);
+		strcpy(new->out, e->out);
+		strcpy(new->logical_in, e->logical_in);
+		strcpy(new->logical_out, e->logical_out);
+		memcpy(new->sourcemac, e->sourcemac, sizeof(new->sourcemac));
+		memcpy(new->sourcemsk, e->sourcemsk, sizeof(new->sourcemsk));
+		memcpy(new->destmac, e->destmac, sizeof(new->destmac));
+		memcpy(new->destmsk, e->destmsk, sizeof(new->destmsk));
+		if (*totalcnt >= u_repl->nentries)
+			ebt_print_bug("*totalcnt >= u_repl->nentries");
+		new->cnt = u_repl->counters[*totalcnt];
+		new->cnt_surplus.pcnt = new->cnt_surplus.bcnt = 0;
+		new->cc = *cc;
+		*cc = (*cc)->next;
+		new->m_list = NULL;
+		new->w_list = NULL;
+		/* Insert the new rule right after *u_e */
+		new->next = (*u_e)->next;
+		new->next->prev = new;
+		(*u_e)->next = new;
+		new->prev = *u_e;
+		*u_e = new;
+		m_l = &new->m_list;
+		EBT_MATCH_ITERATE(e, ebt_translate_match, &m_l);
+		w_l = &new->w_list;
+		EBT_WATCHER_ITERATE(e, ebt_translate_watcher, &w_l);
+
+		t = (struct ebt_entry_target *)(((char *)e) + e->target_offset);
+		new->t = (struct ebt_entry_target *)
+		   malloc(t->target_size + sizeof(struct ebt_entry_target));
+		if (!new->t)
+			ebt_print_memory();
+		if (ebt_find_target(t->u.name) == NULL) {
+			ebt_print_error("Kernel target %s unsupported by "
+					"userspace tool", t->u.name);
+			return -1;
+		}
+		memcpy(new->t, t, t->target_size +
+		   sizeof(struct ebt_entry_target));
+		/* Deal with jumps to udc */
+		if (!strcmp(t->u.name, EBT_STANDARD_TARGET)) {
+			char *tmp = base;
+			int verdict = ((struct ebt_standard_target *)t)->verdict;
+			int i;
+
+			if (verdict >= 0) {
+				/* The kernel verdict is a byte offset; map it
+				 * back to the index of the matching udc */
+				tmp += verdict;
+				for (i = NF_BR_NUMHOOKS; i < u_repl->num_chains; i++)
+					if (u_repl->chains[i]->kernel_start == tmp)
+						break;
+				if (i == u_repl->num_chains)
+					ebt_print_bug("Can't find udc for jump");
+				((struct ebt_standard_target *)new->t)->verdict = i-NF_BR_NUMHOOKS;
+			}
+		}
+
+		(*cnt)++;
+		(*totalcnt)++;
+		return 0;
+	} else { /* A new chain */
+		int i;
+		struct ebt_entries *entries = (struct ebt_entries *)e;
+
+		if (*n != *cnt)
+			ebt_print_bug("Nr of entries in the chain is wrong");
+		*n = entries->nentries;
+		*cnt = 0;
+		for (i = *hook + 1; i < NF_BR_NUMHOOKS; i++)
+			if (valid_hooks & (1 << i))
+				break;
+		*hook = i;
+		*u_e = u_repl->chains[*hook]->entries;
+		return 0;
+	}
+}
+
+/* Initialize all chain headers */
+static int
+ebt_translate_chains(struct ebt_entry *e, int *hook,
+   struct ebt_u_replace *u_repl, unsigned int valid_hooks)
+{
+	int i;
+	struct ebt_entries *entries = (struct ebt_entries *)e;
+	struct ebt_u_entries *new;
+
+	/* Only chain headers are handled here, rules are skipped */
+	if (!(e->bitmask & EBT_ENTRY_OR_ENTRIES)) {
+		for (i = *hook + 1; i < NF_BR_NUMHOOKS; i++)
+			if (valid_hooks & (1 << i))
+				break;
+		new = (struct ebt_u_entries *)malloc(sizeof(struct ebt_u_entries));
+		if (!new)
+			ebt_print_memory();
+		if (i == u_repl->max_chains)
+			ebt_double_chains(u_repl);
+		u_repl->chains[i] = new;
+		if (i >= NF_BR_NUMHOOKS)
+			new->kernel_start = (char *)e;
+		*hook = i;
+		new->nentries = entries->nentries;
+		new->policy = entries->policy;
+		new->entries = (struct ebt_u_entry *)malloc(sizeof(struct ebt_u_entry));
+		if (!new->entries)
+			ebt_print_memory();
+		/* Empty circular list: the head entry points at itself */
+		new->entries->next = new->entries->prev = new->entries;
+		new->counter_offset = entries->counter_offset;
+		strcpy(new->name, entries->name);
+	}
+	return 0;
+}
+
+/*
+ * Load a table stored by store_table_in_file() into repl.
+ * Unless command is 'L' or 8 (--atomic-commit) the table name found in the
+ * file must equal the name repl->name held on entry.
+ * On success repl->entries and repl->counters point to newly allocated
+ * buffers. Returns 0 on success, -1 after reporting an error.
+ */
+static int retrieve_from_file(char *filename, struct ebt_replace *repl,
+   char command)
+{
+	FILE *file;
+	char *hlp = NULL, *entries;
+	struct ebt_counter *counters;
+	int size, ret = 0;
+
+	if (!(file = fopen(filename, "r+b"))) {
+		ebt_print_error("Could not open file %s", filename);
+		return -1;
+	}
+	/* Make sure table name is right if command isn't -L or --atomic-commit */
+	if (command != 'L' && command != 8) {
+		hlp = (char *)malloc(strlen(repl->name) + 1);
+		if (!hlp)
+			ebt_print_memory();
+		strcpy(hlp, repl->name);
+	}
+	if (fread(repl, sizeof(char), sizeof(struct ebt_replace), file)
+	   != sizeof(struct ebt_replace)) {
+		ebt_print_error("File %s is corrupt", filename);
+		ret = -1;
+		goto close_file;
+	}
+	if (command != 'L' && command != 8 && strcmp(hlp, repl->name)) {
+		ebt_print_error("File %s contains wrong table name or is "
+				"corrupt", filename);
+		ret = -1;
+		goto close_file;
+	} else if (!ebt_find_table(repl->name)) {
+		ebt_print_error("File %s contains invalid table name",
+				filename);
+		ret = -1;
+		goto close_file;
+	}
+
+	size = sizeof(struct ebt_replace) +
+	   repl->nentries * sizeof(struct ebt_counter) + repl->entries_size;
+	fseek(file, 0, SEEK_END);
+	if (size != ftell(file)) {
+		ebt_print_error("File %s has wrong size", filename);
+		ret = -1;
+		goto close_file;
+	}
+	entries = (char *)malloc(repl->entries_size);
+	if (!entries)
+		ebt_print_memory();
+	repl->entries = sparc_cast entries;
+	if (repl->nentries) {
+		counters = (struct ebt_counter *)
+		   malloc(repl->nentries * sizeof(struct ebt_counter));
+		repl->counters = sparc_cast counters;
+		if (!repl->counters)
+			ebt_print_memory();
+	} else
+		repl->counters = sparc_cast NULL;
+	/* Copy entries and counters */
+	if (fseek(file, sizeof(struct ebt_replace), SEEK_SET) ||
+	   fread((char *)repl->entries, sizeof(char), repl->entries_size, file)
+	   != repl->entries_size ||
+	   fseek(file, sizeof(struct ebt_replace) + repl->entries_size,
+		 SEEK_SET)
+	   || fread((char *)repl->counters, sizeof(char),
+	   repl->nentries * sizeof(struct ebt_counter), file)
+	   != repl->nentries * sizeof(struct ebt_counter)) {
+		ebt_print_error("File %s is corrupt", filename);
+		free(entries);
+		repl->entries = NULL;
+		/* The counters buffer belongs to this call as well */
+		free((char *)repl->counters);
+		repl->counters = sparc_cast NULL;
+		ret = -1;
+	}
+close_file:
+	fclose(file);
+	free(hlp);
+	return ret;
+}
+
+/*
+ * Fetch the table named repl->name from the kernel: first the header
+ * (EBT_SO_GET_INFO, or EBT_SO_GET_INIT_INFO when init is set), then the
+ * entries and counters into newly allocated buffers.
+ * Returns -1 when no socket is available or the header query fails.
+ */
+static int retrieve_from_kernel(struct ebt_replace *repl, char command,
+				int init)
+{
+	socklen_t optlen;
+	int optname;
+	char *entries;
+
+	optlen = sizeof(struct ebt_replace);
+	if (get_sockfd())
+		return -1;
+	/* --atomic-init || --init-table */
+	if (init)
+		optname = EBT_SO_GET_INIT_INFO;
+	else
+		optname = EBT_SO_GET_INFO;
+	if (getsockopt(sockfd, IPPROTO_IP, optname, repl, &optlen))
+		return -1;
+
+	if ( !(entries = (char *)malloc(repl->entries_size)) )
+		ebt_print_memory();
+	repl->entries = sparc_cast entries;
+	if (repl->nentries) {
+		struct ebt_counter *counters;
+
+		if (!(counters = (struct ebt_counter *)
+		   malloc(repl->nentries * sizeof(struct ebt_counter))) )
+			ebt_print_memory();
+		repl->counters = sparc_cast counters;
+	}
+	else
+		repl->counters = sparc_cast NULL;
+
+	/* We want to receive the counters */
+	repl->num_counters = repl->nentries;
+	optlen += repl->entries_size + repl->num_counters *
+	   sizeof(struct ebt_counter);
+	if (init)
+		optname = EBT_SO_GET_INIT_ENTRIES;
+	else
+		optname = EBT_SO_GET_ENTRIES;
+	if (getsockopt(sockfd, IPPROTO_IP, optname, repl, &optlen))
+		ebt_print_bug("Hmm, what is wrong??? bug#1");
+
+	return 0;
+}
+
+/*
+ * Fill u_repl with the table named u_repl->name, read from
+ * u_repl->filename when set and from the kernel otherwise (init selects
+ * the kernel's initial table). Builds the counter-change list, the chain
+ * headers and finally the rules of every chain.
+ * Returns 0 on success, -1 when the table could not be retrieved.
+ */
+int ebt_get_table(struct ebt_u_replace *u_repl, int init)
+{
+	int i, j, k, hook;
+	struct ebt_replace repl;
+	struct ebt_u_entry *u_e = NULL;
+	struct ebt_cntchanges *new_cc, *cc;
+
+	strcpy(repl.name, u_repl->name);
+	if (u_repl->filename != NULL) {
+		if (init)
+			ebt_print_bug("Getting initial table data from a file is impossible");
+		if (retrieve_from_file(u_repl->filename, &repl, u_repl->command))
+			return -1;
+		/* -L with a wrong table name should be dealt with silently */
+		strcpy(u_repl->name, repl.name);
+	} else if (retrieve_from_kernel(&repl, u_repl->command, init))
+		return -1;
+
+	/* Translate the struct ebt_replace to a struct ebt_u_replace */
+	u_repl->valid_hooks = repl.valid_hooks;
+	u_repl->nentries = repl.nentries;
+	u_repl->num_counters = repl.num_counters;
+	u_repl->counters = repl.counters;
+	u_repl->cc = (struct ebt_cntchanges *)malloc(sizeof(struct ebt_cntchanges));
+	if (!u_repl->cc)
+		ebt_print_memory();
+	/* Circular list with u_repl->cc as head: one CNT_NORM node per rule */
+	u_repl->cc->next = u_repl->cc->prev = u_repl->cc;
+	cc = u_repl->cc;
+	for (i = 0; i < repl.nentries; i++) {
+		new_cc = (struct ebt_cntchanges *)malloc(sizeof(struct ebt_cntchanges));
+		if (!new_cc)
+			ebt_print_memory();
+		new_cc->type = CNT_NORM;
+		new_cc->change = 0;
+		new_cc->prev = cc;
+		cc->next = new_cc;
+		cc = new_cc;
+	}
+	if (repl.nentries) {
+		new_cc->next = u_repl->cc;
+		u_repl->cc->prev = new_cc;
+	}
+	u_repl->chains = (struct ebt_u_entries **)calloc(EBT_ORI_MAX_CHAINS, sizeof(void *));
+	/* ebt_translate_chains() stores into this array right away */
+	if (!u_repl->chains)
+		ebt_print_memory();
+	u_repl->max_chains = EBT_ORI_MAX_CHAINS;
+	hook = -1;
+	/* FIXME: Clean up when an error is encountered */
+	EBT_ENTRY_ITERATE(repl.entries, repl.entries_size, ebt_translate_chains,
+	   &hook, u_repl, u_repl->valid_hooks);
+	if (hook >= NF_BR_NUMHOOKS)
+		u_repl->num_chains = hook + 1;
+	else
+		u_repl->num_chains = NF_BR_NUMHOOKS;
+	i = 0; /* Holds the expected nr. of entries for the chain */
+	j = 0; /* Holds the up to now counted entries for the chain */
+	k = 0; /* Holds the total nr. of entries, should equal u_repl->nentries afterwards */
+	cc = u_repl->cc->next;
+	hook = -1;
+	EBT_ENTRY_ITERATE((char *)repl.entries, repl.entries_size,
+	   ebt_translate_entry, &hook, &i, &j, &k, &u_e, u_repl,
+	   u_repl->valid_hooks, (char *)repl.entries, &cc);
+	if (k != u_repl->nentries)
+		ebt_print_bug("Wrong total nentries");
+	free(repl.entries);
+	return 0;
+}
diff --git a/tools/remus/imqebt/ebtables-standalone.c b/tools/remus/imqebt/ebtables-standalone.c
new file mode 100644
--- /dev/null
+++ b/tools/remus/imqebt/ebtables-standalone.c
@@ -0,0 +1,14 @@
+#include <string.h>
+#include "include/ebtables_u.h"
+
+static struct ebt_u_replace replace;
+void ebt_early_init_once();
+
+int main(int argc, char *argv[])
+{
+	ebt_silent = 0;
+	ebt_early_init_once();
+	strcpy(replace.name, "filter");
+	do_command(argc, argv, EXEC_STYLE_PRG, &replace);
+	return 0;
+}
diff --git a/tools/remus/imqebt/ebtables.c b/tools/remus/imqebt/ebtables.c
new file mode 100644
--- /dev/null
+++ b/tools/remus/imqebt/ebtables.c
@@ -0,0 +1,1233 @@
+/*
+ * ebtables.c, v2.0 July 2002
+ *
+ * Author: Bart De Schuymer
+ *
+ * This code was stongly inspired on the iptables code which is
+ * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <getopt.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <inttypes.h>
+#include "include/ebtables_u.h"
+#include "include/ethernetdb.h"
+
+/* Checks whether a command has already been specified */
+#define OPT_COMMANDS (replace->flags & OPT_COMMAND || replace->flags & OPT_ZERO)
+
+#define OPT_COMMAND    0x01
+#define OPT_TABLE      0x02
+#define OPT_IN         0x04
+#define OPT_OUT        0x08
+#define OPT_JUMP       0x10
+#define OPT_PROTOCOL   0x20
+#define OPT_SOURCE     0x40
+#define OPT_DEST       0x80
+#define OPT_ZERO       0x100
+#define OPT_LOGICALIN  0x200
+#define OPT_LOGICALOUT 0x400
+#define OPT_KERNELDATA 0x800 /* This value is also defined in ebtablesd.c */
+#define OPT_COUNT      0x1000 /* This value is also defined in libebtc.c */
+#define OPT_CNT_INCR   0x2000 /* This value is also defined in libebtc.c */
+#define OPT_CNT_DECR   0x4000 /* This value is also defined in libebtc.c */
+
+/* Default command line options. Do not mess around with the already
+ * assigned numbers unless you know what you are doing */
+static struct option ebt_original_options[] =
+{
+	{ "append"         , required_argument, 0, 'A' },
+	{ "insert"         , required_argument, 0, 'I' },
+	{ "delete"         , required_argument, 0, 'D' },
+	{ "list"           , optional_argument, 0, 'L' },
+	{ "Lc"             , no_argument      , 0, 4   },
+	{ "Ln"             , no_argument      , 0, 5   },
+	{ "Lx"             , no_argument      , 0, 6   },
+	{ "Lmac2"          , no_argument      , 0, 12  },
+	{ "zero"           , optional_argument, 0, 'Z' },
+	{ "flush"          , optional_argument, 0, 'F' },
+	{ "policy"         , required_argument, 0, 'P' },
+	{ "in-interface"   , required_argument, 0, 'i' },
+	{ "in-if"          , required_argument, 0, 'i' },
+	{ "logical-in"     , required_argument, 0, 2   },
+	{ "logical-out"    , required_argument, 0, 3   },
+	{ "out-interface"  , required_argument, 0, 'o' },
+	{ "out-if"         , required_argument, 0, 'o' },
+	{ "version"        , no_argument      , 0, 'V' },
+	{ "help"           , no_argument      , 0, 'h' },
+	{ "jump"           , required_argument, 0, 'j' },
+	{ "set-counters"   , required_argument, 0, 'c' },
+	{ "change-counters", required_argument, 0, 'C' },
+	{ "proto"          , required_argument, 0, 'p' },
+	{ "protocol"       , required_argument, 0, 'p' },
+	{ "db"             , required_argument, 0, 'b' },
+	{ "source"         , required_argument, 0, 's' },
+	{ "src"            , required_argument, 0, 's' },
+	{ "destination"    , required_argument, 0, 'd' },
+	{ "dst"            , required_argument, 0, 'd' },
+	{ "table"          , required_argument, 0, 't' },
+	{ "modprobe"       , required_argument, 0, 'M' },
+	{ "new-chain"      , required_argument, 0, 'N' },
+	{ "rename-chain"   , required_argument, 0, 'E' },
+	{ "delete-chain"   , optional_argument, 0, 'X' },
+	{ "atomic-init"    , no_argument      , 0, 7   },
+	{ "atomic-commit"  , no_argument      , 0, 8   },
+	{ "atomic-file"    , required_argument, 0, 9   },
+	{ "atomic-save"    , no_argument      , 0, 10  },
+	{ "init-table"     , no_argument      , 0, 11  },
+	{ 0 }
+};
+
+static struct option *ebt_options = ebt_original_options;
+
+/* Holds all the data */
+static struct ebt_u_replace *replace;
+
+/* The chosen table */
+static struct ebt_u_table *table;
+
+/* The pointers in here are special:
+ * The struct ebt_target pointer is actually a struct ebt_u_target pointer.
+ * I do not feel like using a union.
+ * We need a struct ebt_u_target pointer because we know the address of the data
+ * they point to won't change. We want to allow that the struct ebt_u_target.t
+ * member can change.
+ * The same holds for the struct ebt_match and struct ebt_watcher pointers */ +static struct ebt_u_entry *new_entry; + + +static int global_option_offset; +#define OPTION_OFFSET 256 +static struct option *merge_options(struct option *oldopts, + const struct option *newopts, unsigned int *options_offset) +{ + unsigned int num_old, num_new, i; + struct option *merge; + + if (!newopts || !oldopts || !options_offset) + ebt_print_bug("merge wrong"); + for (num_old = 0; oldopts[num_old].name; num_old++); + for (num_new = 0; newopts[num_new].name; num_new++); + + global_option_offset += OPTION_OFFSET; + *options_offset = global_option_offset; + + merge = malloc(sizeof(struct option) * (num_new + num_old + 1)); + if (!merge) + ebt_print_memory(); + memcpy(merge, oldopts, num_old * sizeof(struct option)); + for (i = 0; i < num_new; i++) { + merge[num_old + i] = newopts[i]; + merge[num_old + i].val += *options_offset; + } + memset(merge + num_old + num_new, 0, sizeof(struct option)); + /* Only free dynamically allocated stuff */ + if (oldopts != ebt_original_options) + free(oldopts); + + return merge; +} + +static void merge_match(struct ebt_u_match *m) +{ + ebt_options = merge_options + (ebt_options, m->extra_ops, &(m->option_offset)); +} + +static void merge_watcher(struct ebt_u_watcher *w) +{ + ebt_options = merge_options + (ebt_options, w->extra_ops, &(w->option_offset)); +} + +static void merge_target(struct ebt_u_target *t) +{ + ebt_options = merge_options + (ebt_options, t->extra_ops, &(t->option_offset)); +} + +/* Be backwards compatible, so don''t use ''+'' in kernel */ +#define IF_WILDCARD 1 +static void print_iface(const char *iface) +{ + char *c; + + if ((c = strchr(iface, IF_WILDCARD))) + *c = ''+''; + printf("%s ", iface); + if (c) + *c = IF_WILDCARD; +} + +/* We use replace->flags, so we can''t use the following values: + * 0x01 == OPT_COMMAND, 0x02 == OPT_TABLE, 0x100 == OPT_ZERO */ +#define LIST_N 0x04 +#define LIST_C 0x08 +#define LIST_X 0x10 +#define 
LIST_MAC2 0x20 + +/* Helper function for list_rules() */ +static void list_em(struct ebt_u_entries *entries) +{ + int i, j, space = 0, digits; + struct ebt_u_entry *hlp; + struct ebt_u_match_list *m_l; + struct ebt_u_watcher_list *w_l; + struct ebt_u_match *m; + struct ebt_u_watcher *w; + struct ebt_u_target *t; + + if (replace->flags & LIST_MAC2) + ebt_printstyle_mac = 2; + else + ebt_printstyle_mac = 0; + hlp = entries->entries->next; + if (replace->flags & LIST_X && entries->policy != EBT_ACCEPT) { + printf("ebtables -t %s -P %s %s\n", replace->name, + entries->name, ebt_standard_targets[-entries->policy - 1]); + } else if (!(replace->flags & LIST_X)) { + printf("\nBridge chain: %s, entries: %d, policy: %s\n", + entries->name, entries->nentries, + ebt_standard_targets[-entries->policy - 1]); + } + + if (replace->flags & LIST_N) { + i = entries->nentries; + while (i > 9) { + space++; + i /= 10; + } + } + + for (i = 0; i < entries->nentries; i++) { + if (replace->flags & LIST_N) { + digits = 0; + /* A little work to get nice rule numbers. */ + j = i + 1; + while (j > 9) { + digits++; + j /= 10; + } + for (j = 0; j < space - digits; j++) + printf(" "); + printf("%d. ", i + 1); + } + if (replace->flags & LIST_X) + printf("ebtables -t %s -A %s ", + replace->name, entries->name); + + /* The standard target''s print() uses this to find out + * the name of a udc */ + hlp->replace = replace; + + /* Don''t print anything about the protocol if no protocol was + * specified, obviously this means any protocol will do. */ + if (!(hlp->bitmask & EBT_NOPROTO)) { + printf("-p "); + if (hlp->invflags & EBT_IPROTO) + printf("! "); + if (hlp->bitmask & EBT_802_3) + printf("Length "); + else { + struct ethertypeent *ent; + + ent = getethertypebynumber(ntohs(hlp->ethproto)); + if (!ent) + printf("0x%x ", ntohs(hlp->ethproto)); + else + printf("%s ", ent->e_name); + } + } + if (hlp->bitmask & EBT_SOURCEMAC) { + printf("-s "); + if (hlp->invflags & EBT_ISOURCE) + printf("! 
"); + ebt_print_mac_and_mask(hlp->sourcemac, hlp->sourcemsk); + printf(" "); + } + if (hlp->bitmask & EBT_DESTMAC) { + printf("-d "); + if (hlp->invflags & EBT_IDEST) + printf("! "); + ebt_print_mac_and_mask(hlp->destmac, hlp->destmsk); + printf(" "); + } + if (hlp->in[0] != ''\0'') { + printf("-i "); + if (hlp->invflags & EBT_IIN) + printf("! "); + print_iface(hlp->in); + } + if (hlp->logical_in[0] != ''\0'') { + printf("--logical-in "); + if (hlp->invflags & EBT_ILOGICALIN) + printf("! "); + print_iface(hlp->logical_in); + } + if (hlp->logical_out[0] != ''\0'') { + printf("--logical-out "); + if (hlp->invflags & EBT_ILOGICALOUT) + printf("! "); + print_iface(hlp->logical_out); + } + if (hlp->out[0] != ''\0'') { + printf("-o "); + if (hlp->invflags & EBT_IOUT) + printf("! "); + print_iface(hlp->out); + } + + m_l = hlp->m_list; + while (m_l) { + m = ebt_find_match(m_l->m->u.name); + if (!m) + ebt_print_bug("Match not found"); + m->print(hlp, m_l->m); + m_l = m_l->next; + } + w_l = hlp->w_list; + while (w_l) { + w = ebt_find_watcher(w_l->w->u.name); + if (!w) + ebt_print_bug("Watcher not found"); + w->print(hlp, w_l->w); + w_l = w_l->next; + } + + printf("-j "); + if (strcmp(hlp->t->u.name, EBT_STANDARD_TARGET)) + printf("%s ", hlp->t->u.name); + t = ebt_find_target(hlp->t->u.name); + if (!t) + ebt_print_bug("Target ''%s'' not found", hlp->t->u.name); + t->print(hlp, hlp->t); + if (replace->flags & LIST_C) { + uint64_t pcnt = hlp->cnt.pcnt; + uint64_t bcnt = hlp->cnt.bcnt; + + if (replace->flags & LIST_X) + printf("-c %llu %llu", pcnt, bcnt); + else + printf(", pcnt = %"PRIu64" -- bcnt = %"PRIu64, pcnt, bcnt); + } + printf("\n"); + hlp = hlp->next; + } +} + +static void print_help(void) +{ + struct ebt_u_match_list *m_l; + struct ebt_u_watcher_list *w_l; + + PRINT_VERSION; + printf( +"Usage:\n" +"ebtables -[ADI] chain rule-specification [options]\n" +"ebtables -P chain target\n" +"ebtables -[LFZ] [chain]\n" +"ebtables -[NX] [chain]\n" +"ebtables -E old-chain-name 
new-chain-name\n\n" +"Commands:\n" +"--append -A chain : append to chain\n" +"--delete -D chain : delete matching rule from chain\n" +"--delete -D chain rulenum : delete rule at position rulenum from chain\n" +"--change-counters -C chain\n" +" [rulenum] pcnt bcnt : change counters of existing rule\n" +"--insert -I chain rulenum : insert rule at position rulenum in chain\n" +"--list -L [chain] : list the rules in a chain or in all chains\n" +"--flush -F [chain] : delete all rules in chain or in all chains\n" +"--init-table : replace the kernel table with the initial table\n" +"--zero -Z [chain] : put counters on zero in chain or in all chains\n" +"--policy -P chain target : change policy on chain to target\n" +"--new-chain -N chain : create a user defined chain\n" +"--rename-chain -E old new : rename a chain\n" +"--delete-chain -X [chain] : delete a user defined chain\n" +"--atomic-commit : update the kernel w/t table contained in <FILE>\n" +"--atomic-init : put the initial kernel table into <FILE>\n" +"--atomic-save : put the current kernel table into <FILE>\n" +"--atomic-file file : set <FILE> to file\n\n" +"Options:\n" +"--proto -p [!] proto : protocol hexadecimal, by name or LENGTH\n" +"--src -s [!] address[/mask]: source mac address\n" +"--dst -d [!] address[/mask]: destination mac address\n" +"--in-if -i [!] name[+] : network input interface name\n" +"--out-if -o [!] name[+] : network output interface name\n" +"--logical-in [!] name[+] : logical bridge input interface name\n" +"--logical-out [!] 
name[+] : logical bridge output interface name\n" +"--set-counters -c chain\n" +" pcnt bcnt : set the counters of the to be added rule\n" +"--modprobe -M program : try to insert modules using this program\n" +"--version -V : print package version\n\n" +"Environment variable:\n" +ATOMIC_ENV_VARIABLE " : if set <FILE> (see above) will equal its value" +"\n\n"); + m_l = new_entry->m_list; + while (m_l) { + ((struct ebt_u_match *)m_l->m)->help(); + printf("\n"); + m_l = m_l->next; + } + w_l = new_entry->w_list; + while (w_l) { + ((struct ebt_u_watcher *)w_l->w)->help(); + printf("\n"); + w_l = w_l->next; + } + ((struct ebt_u_target *)new_entry->t)->help(); + printf("\n"); + if (table->help) + table->help(ebt_hooknames); +} + +/* Execute command L */ +static void list_rules(void) +{ + int i; + + if (!(replace->flags & LIST_X)) + printf("Bridge table: %s\n", table->name); + if (replace->selected_chain != -1) + list_em(ebt_to_chain(replace)); + else { + /* Create new chains and rename standard chains when necessary */ + if (replace->flags & LIST_X && replace->num_chains > NF_BR_NUMHOOKS) { + for (i = NF_BR_NUMHOOKS; i < replace->num_chains; i++) + printf("ebtables -t %s -N %s\n", replace->name, replace->chains[i]->name); + for (i = 0; i < NF_BR_NUMHOOKS; i++) + if (replace->chains[i] && strcmp(replace->chains[i]->name, ebt_hooknames[i])) + printf("ebtables -t %s -E %s %s\n", replace->name, ebt_hooknames[i], replace->chains[i]->name); + } + for (i = 0; i < replace->num_chains; i++) + if (replace->chains[i]) + list_em(replace->chains[i]); + } +} + +static int parse_rule_range(const char *argv, int *rule_nr, int *rule_nr_end) +{ + char *colon = strchr(argv, '':''), *buffer; + + if (colon) { + *colon = ''\0''; + if (*(colon + 1) == ''\0'') + *rule_nr_end = -1; /* Until the last rule */ + else { + *rule_nr_end = strtol(colon + 1, &buffer, 10); + if (*buffer != ''\0'' || *rule_nr_end == 0) + return -1; + } + } + if (colon == argv) + *rule_nr = 1; /* Beginning with the first 
rule */ + else { + *rule_nr = strtol(argv, &buffer, 10); + if (*buffer != ''\0'' || *rule_nr == 0) + return -1; + } + if (!colon) + *rule_nr_end = *rule_nr; + return 0; +} + +/* Incrementing or decrementing rules in daemon mode is not supported as the + * involved code overload is not worth it (too annoying to take the increased + * counters in the kernel into account). */ +static int parse_change_counters_rule(int argc, char **argv, int *rule_nr, int *rule_nr_end, int exec_style) +{ + char *buffer; + int ret = 0; + + if (optind + 1 >= argc || (argv[optind][0] == ''-'' && (argv[optind][1] < ''0'' || argv[optind][1] > ''9'')) || + (argv[optind + 1][0] == ''-'' && (argv[optind + 1][1] < ''0'' && argv[optind + 1][1] > ''9''))) + ebt_print_error2("The command -C needs at least 2 arguments"); + if (optind + 2 < argc && (argv[optind + 2][0] != ''-'' || (argv[optind + 2][1] >= ''0'' && argv[optind + 2][1] <= ''9''))) { + if (optind + 3 != argc) + ebt_print_error2("No extra options allowed with -C start_nr[:end_nr] pcnt bcnt"); + if (parse_rule_range(argv[optind], rule_nr, rule_nr_end)) + ebt_print_error2("Something is wrong with the rule number specification ''%s''", argv[optind]); + optind++; + } + + if (argv[optind][0] == ''+'') { + if (exec_style == EXEC_STYLE_DAEMON) +daemon_incr: + ebt_print_error2("Incrementing rule counters (%s) not allowed in daemon mode", argv[optind]); + ret += 1; + new_entry->cnt_surplus.pcnt = strtoull(argv[optind] + 1, &buffer, 10); + } else if (argv[optind][0] == ''-'') { + if (exec_style == EXEC_STYLE_DAEMON) +daemon_decr: + ebt_print_error2("Decrementing rule counters (%s) not allowed in daemon mode", argv[optind]); + ret += 2; + new_entry->cnt_surplus.pcnt = strtoull(argv[optind] + 1, &buffer, 10); + } else + new_entry->cnt_surplus.pcnt = strtoull(argv[optind], &buffer, 10); + + if (*buffer != ''\0'') + goto invalid; + optind++; + if (argv[optind][0] == ''+'') { + if (exec_style == EXEC_STYLE_DAEMON) + goto daemon_incr; + ret += 3; + 
new_entry->cnt_surplus.bcnt = strtoull(argv[optind] + 1, &buffer, 10); + } else if (argv[optind][0] == ''-'') { + if (exec_style == EXEC_STYLE_DAEMON) + goto daemon_decr; + ret += 6; + new_entry->cnt_surplus.bcnt = strtoull(argv[optind] + 1, &buffer, 10); + } else + new_entry->cnt_surplus.bcnt = strtoull(argv[optind], &buffer, 10); + + if (*buffer != ''\0'') + goto invalid; + optind++; + return ret; +invalid: + ebt_print_error2("Packet counter ''%s'' invalid", argv[optind]); +} + +static int parse_iface(char *iface, char *option) +{ + char *c; + + if ((c = strchr(iface, ''+''))) { + if (*(c + 1) != ''\0'') { + ebt_print_error("Spurious characters after ''+'' wildcard for ''%s''", option); + return -1; + } else + *c = IF_WILDCARD; + } + return 0; +} + +void ebt_early_init_once(void) +{ + ebt_iterate_matches(merge_match); + ebt_iterate_watchers(merge_watcher); + ebt_iterate_targets(merge_target); +} + +/* We use exec_style instead of #ifdef''s because ebtables.so is a shared object. */ +int do_command(int argc, char *argv[], int exec_style, + struct ebt_u_replace *replace_) +{ + char *buffer; + int c, i; + int zerochain = -1; /* Needed for the -Z option (we can have -Z <this> -L <that>) */ + int chcounter = 0; /* Needed for -C */ + int policy = 0; + int rule_nr = 0; + int rule_nr_end = 0; + struct ebt_u_target *t; + struct ebt_u_match *m; + struct ebt_u_watcher *w; + struct ebt_u_match_list *m_l; + struct ebt_u_watcher_list *w_l; + struct ebt_u_entries *entries; + + opterr = 0; + ebt_modprobe = NULL; + + replace = replace_; + + /* The daemon doesn''t use the environment variable */ + if (exec_style == EXEC_STYLE_PRG) { + buffer = getenv(ATOMIC_ENV_VARIABLE); + if (buffer) { + replace->filename = malloc(strlen(buffer) + 1); + if (!replace->filename) + ebt_print_memory(); + strcpy(replace->filename, buffer); + buffer = NULL; + } + } + + replace->flags &= OPT_KERNELDATA; /* ebtablesd needs OPT_KERNELDATA */ + replace->selected_chain = -1; + replace->command = ''h''; + + 
if (!new_entry) { + new_entry = (struct ebt_u_entry *)malloc(sizeof(struct ebt_u_entry)); + if (!new_entry) + ebt_print_memory(); + } + /* Put some sane values in our new entry */ + ebt_initialize_entry(new_entry); + new_entry->replace = replace; + + /* The scenario induced by this loop makes that: + * ''-t'' ,''-M'' and --atomic (if specified) have to come + * before ''-A'' and the like */ + + /* Getopt saves the day */ + while ((c = getopt_long(argc, argv, + "-A:D:C:I:N:E:X::L::Z::F::P:Vhi:o:j:c:p:s:d:t:M:", ebt_options, NULL)) != -1) { + switch (c) { + + case ''A'': /* Add a rule */ + case ''D'': /* Delete a rule */ + case ''C'': /* Change counters */ + case ''P'': /* Define policy */ + case ''I'': /* Insert a rule */ + case ''N'': /* Make a user defined chain */ + case ''E'': /* Rename chain */ + case ''X'': /* Delete chain */ + /* We allow -N chainname -P policy */ + if (replace->command == ''N'' && c == ''P'') { + replace->command = c; + optind--; /* No table specified */ + goto handle_P; + } + if (OPT_COMMANDS) + ebt_print_error2("Multiple commands are not allowed"); + + replace->command = c; + replace->flags |= OPT_COMMAND; + if (!(replace->flags & OPT_KERNELDATA)) + ebt_get_kernel_table(replace, 0); + if (optarg && (optarg[0] == ''-'' || !strcmp(optarg, "!"))) + ebt_print_error2("No chain name specified"); + if (c == ''N'') { + if (ebt_get_chainnr(replace, optarg) != -1) + ebt_print_error2("Chain %s already exists", optarg); + else if (ebt_find_target(optarg)) + ebt_print_error2("Target with name %s exists", optarg); + else if (strlen(optarg) >= EBT_CHAIN_MAXNAMELEN) + ebt_print_error2("Chain name length can''t exceed %d", + EBT_CHAIN_MAXNAMELEN - 1); + else if (strchr(optarg, '' '') != NULL) + ebt_print_error2("Use of '' '' not allowed in chain names"); + ebt_new_chain(replace, optarg, EBT_ACCEPT); + /* This is needed to get -N x -P y working */ + replace->selected_chain = ebt_get_chainnr(replace, optarg); + break; + } else if (c == ''X'') { + if (optind 
>= argc) { + replace->selected_chain = -1; + ebt_delete_chain(replace); + break; + } + + if (optind < argc - 1) + ebt_print_error2("No extra options allowed with -X"); + + if ((replace->selected_chain = ebt_get_chainnr(replace, argv[optind])) == -1) + ebt_print_error2("Chain ''%s'' doesn''t exist", argv[optind]); + ebt_delete_chain(replace); + if (ebt_errormsg[0] != ''\0'') + return -1; + optind++; + break; + } + + if ((replace->selected_chain = ebt_get_chainnr(replace, optarg)) == -1) + ebt_print_error2("Chain ''%s'' doesn''t exist", optarg); + if (c == ''E'') { + if (optind >= argc) + ebt_print_error2("No new chain name specified"); + else if (optind < argc - 1) + ebt_print_error2("No extra options allowed with -E"); + else if (strlen(argv[optind]) >= EBT_CHAIN_MAXNAMELEN) + ebt_print_error2("Chain name length can''t exceed %d characters", EBT_CHAIN_MAXNAMELEN - 1); + else if (ebt_get_chainnr(replace, argv[optind]) != -1) + ebt_print_error2("Chain ''%s'' already exists", argv[optind]); + else if (ebt_find_target(argv[optind])) + ebt_print_error2("Target with name ''%s'' exists", argv[optind]); + else if (strchr(argv[optind], '' '') != NULL) + ebt_print_error2("Use of '' '' not allowed in chain names"); + ebt_rename_chain(replace, argv[optind]); + optind++; + break; + } else if (c == ''D'' && optind < argc && (argv[optind][0] != ''-'' || (argv[optind][1] >= ''0'' && argv[optind][1] <= ''9''))) { + if (optind != argc - 1) + ebt_print_error2("No extra options allowed with -D start_nr[:end_nr]"); + if (parse_rule_range(argv[optind], &rule_nr, &rule_nr_end)) + ebt_print_error2("Problem with the specified rule number(s) ''%s''", argv[optind]); + optind++; + } else if (c == ''C'') { + if ((chcounter = parse_change_counters_rule(argc, argv, &rule_nr, &rule_nr_end, exec_style)) == -1) + return -1; + } else if (c == ''I'') { + if (optind >= argc || (argv[optind][0] == ''-'' && (argv[optind][1] < ''0'' || argv[optind][1] > ''9''))) + rule_nr = 1; + else { + rule_nr = 
strtol(argv[optind], &buffer, 10); + if (*buffer != ''\0'') + ebt_print_error2("Problem with the specified rule number ''%s''", argv[optind]); + optind++; + } + } else if (c == ''P'') { +handle_P: + if (optind >= argc) + ebt_print_error2("No policy specified"); + for (i = 0; i < NUM_STANDARD_TARGETS; i++) + if (!strcmp(argv[optind], ebt_standard_targets[i])) { + policy = -i -1; + if (policy == EBT_CONTINUE) + ebt_print_error2("Wrong policy ''%s''", argv[optind]); + break; + } + if (i == NUM_STANDARD_TARGETS) + ebt_print_error2("Unknown policy ''%s''", argv[optind]); + optind++; + } + break; + case ''L'': /* List */ + case ''F'': /* Flush */ + case ''Z'': /* Zero counters */ + if (c == ''Z'') { + if ((replace->flags & OPT_ZERO) || (replace->flags & OPT_COMMAND && replace->command != ''L'')) +print_zero: + ebt_print_error2("Command -Z only allowed together with command -L"); + replace->flags |= OPT_ZERO; + } else { + if (replace->flags & OPT_COMMAND) + ebt_print_error2("Multiple commands are not allowed"); + replace->command = c; + replace->flags |= OPT_COMMAND; + if (replace->flags & OPT_ZERO && c != ''L'') + goto print_zero; + } + +#ifdef SILENT_DAEMON + if (c== ''L'' && exec_style == EXEC_STYLE_DAEMON) + ebt_print_error2("-L not supported in daemon mode"); +#endif + + if (!(replace->flags & OPT_KERNELDATA)) + ebt_get_kernel_table(replace, 0); + i = -1; + if (optind < argc && argv[optind][0] != ''-'') { + if ((i = ebt_get_chainnr(replace, argv[optind])) == -1) + ebt_print_error2("Chain ''%s'' doesn''t exist", argv[optind]); + optind++; + } + if (i != -1) { + if (c == ''Z'') + zerochain = i; + else + replace->selected_chain = i; + } + break; + case ''V'': /* Version */ + if (OPT_COMMANDS) + ebt_print_error2("Multiple commands are not allowed"); + replace->command = ''V''; + if (exec_style == EXEC_STYLE_DAEMON) + ebt_print_error2(PROGNAME" v"PROGVERSION" ("PROGDATE")\n"); + PRINT_VERSION; + exit(0); + case ''M'': /* Modprobe */ + if (OPT_COMMANDS) + 
ebt_print_error2("Please put the -M option earlier"); + free(ebt_modprobe); + ebt_modprobe = optarg; + break; + case ''h'': /* Help */ +#ifdef SILENT_DAEMON + if (exec_style == EXEC_STYLE_DAEMON) + ebt_print_error2("-h not supported in daemon mode"); +#endif + if (OPT_COMMANDS) + ebt_print_error2("Multiple commands are not allowed"); + replace->command = ''h''; + + /* All other arguments should be extension names */ + while (optind < argc) { + struct ebt_u_match *m; + struct ebt_u_watcher *w; + + if (!strcasecmp("list_extensions", argv[optind])) { + ebt_list_extensions(); + exit(0); + } + if ((m = ebt_find_match(argv[optind]))) + ebt_add_match(new_entry, m); + else if ((w = ebt_find_watcher(argv[optind]))) + ebt_add_watcher(new_entry, w); + else { + if (!(t = ebt_find_target(argv[optind]))) + ebt_print_error2("Extension ''%s'' not found", argv[optind]); + if (replace->flags & OPT_JUMP) + ebt_print_error2("Sorry, you can only see help for one target extension at a time"); + replace->flags |= OPT_JUMP; + new_entry->t = (struct ebt_entry_target *)t; + } + optind++; + } + break; + case ''t'': /* Table */ + if (OPT_COMMANDS) + ebt_print_error2("Please put the -t option first"); + ebt_check_option2(&(replace->flags), OPT_TABLE); + if (strlen(optarg) > EBT_TABLE_MAXNAMELEN - 1) + ebt_print_error2("Table name length cannot exceed %d characters", EBT_TABLE_MAXNAMELEN - 1); + strcpy(replace->name, optarg); + break; + case ''i'': /* Input interface */ + case 2 : /* Logical input interface */ + case ''o'': /* Output interface */ + case 3 : /* Logical output interface */ + case ''j'': /* Target */ + case ''p'': /* Net family protocol */ + case ''s'': /* Source mac */ + case ''d'': /* Destination mac */ + case ''c'': /* Set counters */ + if (!OPT_COMMANDS) + ebt_print_error2("No command specified"); + if (replace->command != ''A'' && replace->command != ''D'' && replace->command != ''I'' && replace->command != ''C'') + ebt_print_error2("Command and option do not match"); + if (c 
== ''i'') { + ebt_check_option2(&(replace->flags), OPT_IN); + if (replace->selected_chain > 2 && replace->selected_chain < NF_BR_BROUTING) + ebt_print_error2("Use -i only in INPUT, FORWARD, PREROUTING and BROUTING chains"); + if (ebt_check_inverse2(optarg)) + new_entry->invflags |= EBT_IIN; + + if (strlen(optarg) >= IFNAMSIZ) +big_iface_length: + ebt_print_error2("Interface name length cannot exceed %d characters", IFNAMSIZ - 1); + strcpy(new_entry->in, optarg); + if (parse_iface(new_entry->in, "-i")) + return -1; + break; + } else if (c == 2) { + ebt_check_option2(&(replace->flags), OPT_LOGICALIN); + if (replace->selected_chain > 2 && replace->selected_chain < NF_BR_BROUTING) + ebt_print_error2("Use --logical-in only in INPUT, FORWARD, PREROUTING and BROUTING chains"); + if (ebt_check_inverse2(optarg)) + new_entry->invflags |= EBT_ILOGICALIN; + + if (strlen(optarg) >= IFNAMSIZ) + goto big_iface_length; + strcpy(new_entry->logical_in, optarg); + if (parse_iface(new_entry->logical_in, "--logical-in")) + return -1; + break; + } else if (c == ''o'') { + ebt_check_option2(&(replace->flags), OPT_OUT); + if (replace->selected_chain < 2 || replace->selected_chain == NF_BR_BROUTING) + ebt_print_error2("Use -o only in OUTPUT, FORWARD and POSTROUTING chains"); + if (ebt_check_inverse2(optarg)) + new_entry->invflags |= EBT_IOUT; + + if (strlen(optarg) >= IFNAMSIZ) + goto big_iface_length; + strcpy(new_entry->out, optarg); + if (parse_iface(new_entry->out, "-o")) + return -1; + break; + } else if (c == 3) { + ebt_check_option2(&(replace->flags), OPT_LOGICALOUT); + if (replace->selected_chain < 2 || replace->selected_chain == NF_BR_BROUTING) + ebt_print_error2("Use --logical-out only in OUTPUT, FORWARD and POSTROUTING chains"); + if (ebt_check_inverse2(optarg)) + new_entry->invflags |= EBT_ILOGICALOUT; + + if (strlen(optarg) >= IFNAMSIZ) + goto big_iface_length; + strcpy(new_entry->logical_out, optarg); + if (parse_iface(new_entry->logical_out, "--logical-out")) + return -1; + 
break; + } else if (c == ''j'') { + ebt_check_option2(&(replace->flags), OPT_JUMP); + for (i = 0; i < NUM_STANDARD_TARGETS; i++) + if (!strcmp(optarg, ebt_standard_targets[i])) { + t = ebt_find_target(EBT_STANDARD_TARGET); + ((struct ebt_standard_target *) t->t)->verdict = -i - 1; + break; + } + if (-i - 1 == EBT_RETURN && replace->selected_chain < NF_BR_NUMHOOKS) { + ebt_print_error2("Return target only for user defined chains"); + } else if (i != NUM_STANDARD_TARGETS) + break; + + if ((i = ebt_get_chainnr(replace, optarg)) != -1) { + if (i < NF_BR_NUMHOOKS) + ebt_print_error2("Don''t jump to a standard chain"); + t = ebt_find_target(EBT_STANDARD_TARGET); + ((struct ebt_standard_target *) t->t)->verdict = i - NF_BR_NUMHOOKS; + break; + } else { + /* Must be an extension then */ + struct ebt_u_target *t; + + t = ebt_find_target(optarg); + /* -j standard not allowed either */ + if (!t || t == (struct ebt_u_target *)new_entry->t) + ebt_print_error2("Illegal target name ''%s''", optarg); + new_entry->t = (struct ebt_entry_target *)t; + ebt_find_target(EBT_STANDARD_TARGET)->used = 0; + t->used = 1; + } + break; + } else if (c == ''s'') { + ebt_check_option2(&(replace->flags), OPT_SOURCE); + if (ebt_check_inverse2(optarg)) + new_entry->invflags |= EBT_ISOURCE; + + if (ebt_get_mac_and_mask(optarg, new_entry->sourcemac, new_entry->sourcemsk)) + ebt_print_error2("Problem with specified source mac ''%s''", optarg); + new_entry->bitmask |= EBT_SOURCEMAC; + break; + } else if (c == ''d'') { + ebt_check_option2(&(replace->flags), OPT_DEST); + if (ebt_check_inverse2(optarg)) + new_entry->invflags |= EBT_IDEST; + + if (ebt_get_mac_and_mask(optarg, new_entry->destmac, new_entry->destmsk)) + ebt_print_error2("Problem with specified destination mac ''%s''", optarg); + new_entry->bitmask |= EBT_DESTMAC; + break; + } else if (c == ''c'') { + ebt_check_option2(&(replace->flags), OPT_COUNT); + if (ebt_check_inverse2(optarg)) + ebt_print_error2("Unexpected ''!'' after -c"); + if (optind 
>= argc || optarg[0] == ''-'' || argv[optind][0] == ''-'') + ebt_print_error2("Option -c needs 2 arguments"); + + new_entry->cnt.pcnt = strtoull(optarg, &buffer, 10); + if (*buffer != ''\0'') + ebt_print_error2("Packet counter ''%s'' invalid", optarg); + new_entry->cnt.bcnt = strtoull(argv[optind], &buffer, 10); + if (*buffer != ''\0'') + ebt_print_error2("Packet counter ''%s'' invalid", argv[optind]); + optind++; + break; + } + ebt_check_option2(&(replace->flags), OPT_PROTOCOL); + if (ebt_check_inverse2(optarg)) + new_entry->invflags |= EBT_IPROTO; + + new_entry->bitmask &= ~((unsigned int)EBT_NOPROTO); + i = strtol(optarg, &buffer, 16); + if (*buffer == ''\0'' && (i < 0 || i > 0xFFFF)) + ebt_print_error2("Problem with the specified protocol"); + if (*buffer != ''\0'') { + struct ethertypeent *ent; + + if (!strcasecmp(optarg, "LENGTH")) { + new_entry->bitmask |= EBT_802_3; + break; + } + ent = getethertypebyname(optarg); + if (!ent) + ebt_print_error2("Problem with the specified Ethernet protocol ''%s'', perhaps "_PATH_ETHERTYPES " is missing", optarg); + new_entry->ethproto = ent->e_ethertype; + } else + new_entry->ethproto = i; + + if (new_entry->ethproto < 0x0600) + ebt_print_error2("Sorry, protocols have values above or equal to 0x0600"); + break; + case 4 : /* Lc */ +#ifdef SILENT_DAEMON + if (exec_style == EXEC_STYLE_DAEMON) + ebt_print_error2("--Lc is not supported in daemon mode"); +#endif + ebt_check_option2(&(replace->flags), LIST_C); + if (replace->command != ''L'') + ebt_print_error("Use --Lc with -L"); + replace->flags |= LIST_C; + break; + case 5 : /* Ln */ +#ifdef SILENT_DAEMON + if (exec_style == EXEC_STYLE_DAEMON) + ebt_print_error2("--Ln is not supported in daemon mode"); +#endif + ebt_check_option2(&(replace->flags), LIST_N); + if (replace->command != ''L'') + ebt_print_error2("Use --Ln with -L"); + if (replace->flags & LIST_X) + ebt_print_error2("--Lx is not compatible with --Ln"); + replace->flags |= LIST_N; + break; + case 6 : /* Lx */ 
+#ifdef SILENT_DAEMON + if (exec_style == EXEC_STYLE_DAEMON) + ebt_print_error2("--Lx is not supported in daemon mode"); +#endif + ebt_check_option2(&(replace->flags), LIST_X); + if (replace->command != ''L'') + ebt_print_error2("Use --Lx with -L"); + if (replace->flags & LIST_N) + ebt_print_error2("--Lx is not compatible with --Ln"); + replace->flags |= LIST_X; + break; + case 12 : /* Lmac2 */ +#ifdef SILENT_DAEMON + if (exec_style == EXEC_STYLE_DAEMON) + ebt_print_error("--Lmac2 is not supported in daemon mode"); +#endif + ebt_check_option2(&(replace->flags), LIST_MAC2); + if (replace->command != ''L'') + ebt_print_error2("Use --Lmac2 with -L"); + replace->flags |= LIST_MAC2; + break; + case 8 : /* atomic-commit */ + if (exec_style == EXEC_STYLE_DAEMON) + ebt_print_error2("--atomic-commit is not supported in daemon mode"); + replace->command = c; + if (OPT_COMMANDS) + ebt_print_error2("Multiple commands are not allowed"); + replace->flags |= OPT_COMMAND; + if (!replace->filename) + ebt_print_error2("No atomic file specified"); + /* Get the information from the file */ + ebt_get_table(replace, 0); + /* We don''t want the kernel giving us its counters, + * they would overwrite the counters extracted from + * the file */ + replace->num_counters = 0; + /* Make sure the table will be written to the kernel */ + free(replace->filename); + replace->filename = NULL; + break; + case 7 : /* atomic-init */ + case 10: /* atomic-save */ + case 11: /* init-table */ + if (exec_style == EXEC_STYLE_DAEMON) { + if (c == 7) { + ebt_print_error2("--atomic-init is not supported in daemon mode"); + } else if (c == 10) + ebt_print_error2("--atomic-save is not supported in daemon mode"); + ebt_print_error2("--init-table is not supported in daemon mode"); + } + replace->command = c; + if (OPT_COMMANDS) + ebt_print_error2("Multiple commands are not allowed"); + if (c != 11 && !replace->filename) + ebt_print_error2("No atomic file specified"); + replace->flags |= OPT_COMMAND; + { + char 
*tmp = replace->filename; + + /* Get the kernel table */ + replace->filename = NULL; + ebt_get_kernel_table(replace, c == 10 ? 0 : 1); + replace->filename = tmp; + } + break; + case 9 : /* atomic */ + if (exec_style == EXEC_STYLE_DAEMON) + ebt_print_error2("--atomic is not supported in daemon mode"); + if (OPT_COMMANDS) + ebt_print_error2("--atomic has to come before the command"); + /* A possible memory leak here, but this is not + * executed in daemon mode */ + replace->filename = (char *)malloc(strlen(optarg) + 1); + strcpy(replace->filename, optarg); + break; + case 1 : + if (!strcmp(optarg, "!")) + ebt_check_inverse2(optarg); + else + ebt_print_error2("Bad argument : ''%s''", optarg); + /* ebt_check_inverse() did optind++ */ + optind--; + continue; + default: + /* Is it a target option? */ + t = (struct ebt_u_target *)new_entry->t; + if ((t->parse(c - t->option_offset, argv, argc, new_entry, &t->flags, &t->t))) { + if (ebt_errormsg[0] != ''\0'') + return -1; + goto check_extension; + } + + /* Is it a match_option? */ + for (m = ebt_matches; m; m = m->next) + if (m->parse(c - m->option_offset, argv, argc, new_entry, &m->flags, &m->m)) + break; + + if (m != NULL) { + if (ebt_errormsg[0] != ''\0'') + return -1; + if (m->used == 0) { + ebt_add_match(new_entry, m); + m->used = 1; + } + goto check_extension; + } + + /* Is it a watcher option? 
*/ + for (w = ebt_watchers; w; w = w->next) + if (w->parse(c - w->option_offset, argv, argc, new_entry, &w->flags, &w->w)) + break; + + if (w == NULL && c == ''?'') + ebt_print_error2("Unknown argument: ''%s''", argv[optind - 1], (char)optopt, (char)c); + else if (w == NULL) { + if (!strcmp(t->name, "standard")) + ebt_print_error2("Unknown argument: don''t forget the -t option"); + else + ebt_print_error2("Target-specific option does not correspond with specified target"); + } + if (ebt_errormsg[0] != ''\0'') + return -1; + if (w->used == 0) { + ebt_add_watcher(new_entry, w); + w->used = 1; + } +check_extension: + if (replace->command != ''A'' && replace->command != ''I'' && + replace->command != ''D'' && replace->command != ''C'') + ebt_print_error2("Extensions only for -A, -I, -D and -C"); + } + ebt_invert = 0; + } + + /* Just in case we didn''t catch an error */ + if (ebt_errormsg[0] != ''\0'') + return -1; + + if (!(table = ebt_find_table(replace->name))) + ebt_print_error2("Bad table name"); + + if (replace->command == ''h'' && !(replace->flags & OPT_ZERO)) { + print_help(); + if (exec_style == EXEC_STYLE_PRG) + exit(0); + } + + /* Do the final checks */ + if (replace->command == ''A'' || replace->command == ''I'' || + replace->command == ''D'' || replace->command == ''C'') { + /* This will put the hook_mask right for the chains */ + ebt_check_for_loops(replace); + if (ebt_errormsg[0] != ''\0'') + return -1; + entries = ebt_to_chain(replace); + m_l = new_entry->m_list; + w_l = new_entry->w_list; + t = (struct ebt_u_target *)new_entry->t; + while (m_l) { + m = (struct ebt_u_match *)(m_l->m); + m->final_check(new_entry, m->m, replace->name, + entries->hook_mask, 0); + if (ebt_errormsg[0] != ''\0'') + return -1; + m_l = m_l->next; + } + while (w_l) { + w = (struct ebt_u_watcher *)(w_l->w); + w->final_check(new_entry, w->w, replace->name, + entries->hook_mask, 0); + if (ebt_errormsg[0] != ''\0'') + return -1; + w_l = w_l->next; + } + t->final_check(new_entry, 
t->t, replace->name, + entries->hook_mask, 0); + if (ebt_errormsg[0] != ''\0'') + return -1; + } + /* So, the extensions can work with the host endian. + * The kernel does not have to do this of course */ + new_entry->ethproto = htons(new_entry->ethproto); + + if (replace->command == ''P'') { + if (replace->selected_chain < NF_BR_NUMHOOKS && policy == EBT_RETURN) + ebt_print_error2("Policy RETURN only allowed for user defined chains"); + ebt_change_policy(replace, policy); + if (ebt_errormsg[0] != ''\0'') + return -1; + } else if (replace->command == ''L'') { + list_rules(); + if (!(replace->flags & OPT_ZERO) && exec_style == EXEC_STYLE_PRG) + exit(0); + } + if (replace->flags & OPT_ZERO) { + replace->selected_chain = zerochain; + ebt_zero_counters(replace); + } else if (replace->command == ''F'') { + ebt_flush_chains(replace); + } else if (replace->command == ''A'' || replace->command == ''I'') { + ebt_add_rule(replace, new_entry, rule_nr); + if (ebt_errormsg[0] != ''\0'') + return -1; + /* Makes undoing the add easier (jumps to delete_the_rule) */ + if (rule_nr <= 0) + rule_nr--; + rule_nr_end = rule_nr; + + /* a jump to a udc requires checking for loops */ + if (!strcmp(new_entry->t->u.name, EBT_STANDARD_TARGET) && + ((struct ebt_standard_target *)(new_entry->t))->verdict >= 0) { + /* FIXME: this can be done faster */ + ebt_check_for_loops(replace); + if (ebt_errormsg[0] != ''\0'') + goto delete_the_rule; + } + + /* Do the final_check(), for all entries. 
+ * This is needed when adding a rule that has a chain target */ + i = -1; + while (++i != replace->num_chains) { + struct ebt_u_entry *e; + + entries = replace->chains[i]; + if (!entries) { + if (i < NF_BR_NUMHOOKS) + continue; + else + ebt_print_bug("whoops\n"); + } + e = entries->entries->next; + while (e != entries->entries) { + /* Userspace extensions use host endian */ + e->ethproto = ntohs(e->ethproto); + ebt_do_final_checks(replace, e, entries); + if (ebt_errormsg[0] != ''\0'') + goto delete_the_rule; + e->ethproto = htons(e->ethproto); + e = e->next; + } + } + /* Don''t reuse the added rule */ + new_entry = NULL; + } else if (replace->command == ''D'') { +delete_the_rule: + ebt_delete_rule(replace, new_entry, rule_nr, rule_nr_end); + if (ebt_errormsg[0] != ''\0'') + return -1; + } else if (replace->command == ''C'') { + ebt_change_counters(replace, new_entry, rule_nr, rule_nr_end, &(new_entry->cnt_surplus), chcounter); + if (ebt_errormsg[0] != ''\0'') + return -1; + } + /* Commands -N, -E, -X, --atomic-commit, --atomic-commit, --atomic-save, + * --init-table fall through */ + + if (ebt_errormsg[0] != ''\0'') + return -1; + if (table->check) + table->check(replace); + + if (exec_style == EXEC_STYLE_PRG) {/* Implies ebt_errormsg[0] == ''\0'' */ + ebt_deliver_table(replace); + + if (replace->nentries) + ebt_deliver_counters(replace); + } + return 0; +} diff --git a/tools/remus/imqebt/extensions/Makefile b/tools/remus/imqebt/extensions/Makefile new file mode 100644 --- /dev/null +++ b/tools/remus/imqebt/extensions/Makefile @@ -0,0 +1,29 @@ +#! 
/usr/bin/make + +EXT_FUNC+=standard imq +EXT_TABLES+=filter +EXT_OBJS+=$(foreach T,$(EXT_FUNC), extensions/ebt_$(T).o) +EXT_OBJS+=$(foreach T,$(EXT_TABLES), extensions/ebtable_$(T).o) +EXT_LIBS+=$(foreach T,$(EXT_FUNC), extensions/libebt_$(T).so) +EXT_LIBS+=$(foreach T,$(EXT_TABLES), extensions/libebtable_$(T).so) +EXT_LIBSI+=$(foreach T,$(EXT_FUNC), -lebt_$(T)) +EXT_LIBSI+=$(foreach T,$(EXT_TABLES), -lebtable_$(T)) + +extensions/ebt_%.so: extensions/ebt_%.o + $(CC) -shared -o $@ -lc $< -nostartfiles + +extensions/libebt_%.so: extensions/ebt_%.so + mv $< $@ + +extensions/ebtable_%.so: extensions/ebtable_%.o + $(CC) -shared -o $@ -lc $< -nostartfiles + +extensions/libebtable_%.so: extensions/ebtable_%.so + mv $< $@ + +extensions/ebt_%.o: extensions/ebt_%.c include/ebtables_u.h + $(CC) $(CFLAGS) $(CFLAGS_SH_LIB) $(PROGSPECS) -c -o $@ $< -I$(KERNEL_INCLUDES) + +extensions/ebtable_%.o: extensions/ebtable_%.c + $(CC) $(CFLAGS) $(CFLAGS_SH_LIB) $(PROGSPECS) -c -o $@ $< -I$(KERNEL_INCLUDES) + diff --git a/tools/remus/imqebt/extensions/ebt_imq.c b/tools/remus/imqebt/extensions/ebt_imq.c new file mode 100644 --- /dev/null +++ b/tools/remus/imqebt/extensions/ebt_imq.c @@ -0,0 +1,84 @@ +#include <stdio.h> +#include <getopt.h> +#include <stdlib.h> +#include "../include/ebtables_u.h" +#include <linux/netfilter_bridge/ebt_imq.h> + +#define IMQ_TODEV ''1'' + +static struct option opts[] +{ + { "todev" , required_argument, 0, IMQ_TODEV }, + { 0 } +}; + +static void help(void) +{ + printf( + "IMQ options:\n" + " --todev <N> enqueue to imq<N>, defaults to 0\n"); +} + +static void init(struct ebt_entry_target *target) +{ + struct ebt_imq_info *imqinfo = (struct ebt_imq_info *)target->data; + + imqinfo->todev = 0; +} + +static int parse(int c, char **argv, int argc, const struct ebt_u_entry *entry, + unsigned int *flags, struct ebt_entry_target **target) +{ + struct ebt_imq_info *imqinfo = (struct ebt_imq_info *)(*target)->data; + + switch(c) { + case IMQ_TODEV: + imqinfo->todev = 
atoi(optarg); + } + + return 1; +} + +static void final_check(const struct ebt_u_entry *entry, + const struct ebt_entry_target *target, const char *name, + unsigned int hookmask, unsigned int time) +{ +} + +static void print(const struct ebt_u_entry *entry, + const struct ebt_entry_target *target) +{ + struct ebt_imq_info *imqinfo = (struct ebt_imq_info *)target->data; + + printf("--todev %d", imqinfo->todev); +} + +static int compare(const struct ebt_entry_target *t1, + const struct ebt_entry_target *t2) +{ + struct ebt_imq_info *imqinfo1 = (struct ebt_imq_info *)t1->data; + struct ebt_imq_info *imqinfo2 = (struct ebt_imq_info *)t2->data; + + if (imqinfo1->todev != imqinfo2->todev) + return 0; + + return 1; +} + +static struct ebt_u_target imq_target +{ + .name = "imq", + .size = sizeof(struct ebt_imq_info), + .help = help, + .init = init, + .parse = parse, + .final_check = final_check, + .print = print, + .compare = compare, + .extra_ops = opts, +}; + +void _init(void) +{ + ebt_register_target(&imq_target); +} diff --git a/tools/remus/imqebt/extensions/ebt_standard.c b/tools/remus/imqebt/extensions/ebt_standard.c new file mode 100644 --- /dev/null +++ b/tools/remus/imqebt/extensions/ebt_standard.c @@ -0,0 +1,90 @@ +/* ebt_standard + * + * Authors: + * Bart De Schuymer <bdschuym@pandora.be> + * + * April, 2002 + */ + +#include <stdio.h> +#include <stdlib.h> +#include <getopt.h> +#include "../include/ebtables_u.h" + +static struct option opts[] +{ + {0} +}; + +static void print_help(void) +{ + printf("Standard targets: DROP, ACCEPT, RETURN or CONTINUE;\n" + "The target can also be a user defined chain.\n"); +} + +static void init(struct ebt_entry_target *t) +{ + ((struct ebt_standard_target *)t)->verdict = EBT_CONTINUE; +} + +static int parse(int c, char **argv, int argc, const struct ebt_u_entry *entry, + unsigned int *flags, struct ebt_entry_target **target) +{ + return 0; +} + +static void final_check(const struct ebt_u_entry *entry, + const struct 
ebt_entry_target *target, const char *name, + unsigned int hookmask, unsigned int time) +{ +} + +static void print(const struct ebt_u_entry *entry, + const struct ebt_entry_target *target) +{ + int verdict = ((struct ebt_standard_target *)target)->verdict; + + if (verdict >= 0) { + struct ebt_u_entries *entries; + + entries = entry->replace->chains[verdict + NF_BR_NUMHOOKS]; + printf("%s", entries->name); + return; + } + if (verdict == EBT_CONTINUE) + printf("CONTINUE "); + else if (verdict == EBT_ACCEPT) + printf("ACCEPT "); + else if (verdict == EBT_DROP) + printf("DROP "); + else if (verdict == EBT_RETURN) + printf("RETURN "); + else + ebt_print_bug("Bad standard target"); +} + +static int compare(const struct ebt_entry_target *t1, + const struct ebt_entry_target *t2) +{ + return ((struct ebt_standard_target *)t1)->verdict == + ((struct ebt_standard_target *)t2)->verdict; +} + +static struct ebt_u_target standard = +{ + .name = "standard", + .size = sizeof(struct ebt_standard_target) - + sizeof(struct ebt_entry_target), + .help = print_help, + .init = init, + .parse = parse, + .final_check = final_check, + .print = print, + .compare = compare, + .extra_ops = opts, +}; + +void _init(void) +{ + ebt_register_target(&standard); +} diff --git a/tools/remus/imqebt/extensions/ebtable_filter.c b/tools/remus/imqebt/extensions/ebtable_filter.c new file mode 100644 --- /dev/null +++ b/tools/remus/imqebt/extensions/ebtable_filter.c @@ -0,0 +1,35 @@ +/* ebtable_filter + * + * Authors: + * Bart De Schuymer <bdschuym@pandora.be> + * + * April, 2002 + */ + +#include <stdio.h> +#include "../include/ebtables_u.h" + +#define FILTER_VALID_HOOKS ((1 << NF_BR_LOCAL_IN) | (1 << NF_BR_FORWARD) | \ + (1 << NF_BR_LOCAL_OUT)) + +static void print_help(const char **hn) +{ + int i; + + printf("Supported chains for the filter table:\n"); + for (i = 0; i < NF_BR_NUMHOOKS; i++) + if (FILTER_VALID_HOOKS & (1 << i)) + printf("%s ", hn[i]); + printf("\n"); +} + +static struct ebt_u_table table = +{ + 
.name = "filter", + .help = print_help, +}; + +void _init(void) +{ + ebt_register_table(&table); +} diff --git a/tools/remus/imqebt/getethertype.c b/tools/remus/imqebt/getethertype.c new file mode 100644 --- /dev/null +++ b/tools/remus/imqebt/getethertype.c @@ -0,0 +1,162 @@ +/* +* getethertype.c +* +* This file was part of the NYS Library. +* +** The NYS Library is free software; you can redistribute it and/or +** modify it under the terms of the GNU Library General Public License as +** published by the Free Software Foundation; either version 2 of the +** License, or (at your option) any later version. +* +* This program is free software; you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation; either version 2 of the License, or +* (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. 
+* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software +* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +/******************************************************************** +* Description: Ethertype name service switch and the ethertypes +* database access functions +* Author: Nick Fedchik <fnm@ukrsat.com> +* Checker: Bart De Schuymer <bdschuym@pandora.be> +* Origin: uClibc-0.9.16/libc/inet/getproto.c +* Created at: Mon Nov 11 12:20:11 EET 2002 +********************************************************************/ + + +#include <ctype.h> +#include <features.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <netdb.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <netinet/ether.h> +#include <net/ethernet.h> + +#include "ethernetdb.h" + +#define MAXALIASES 35 + +static FILE *etherf = NULL; +static char line[BUFSIZ + 1]; +static struct ethertypeent et_ent; +static char *ethertype_aliases[MAXALIASES]; +static int ethertype_stayopen; + +void setethertypeent(int f) +{ + if (etherf == NULL) + etherf = fopen(_PATH_ETHERTYPES, "r"); + else + rewind(etherf); + ethertype_stayopen |= f; +} + +void endethertypeent(void) +{ + if (etherf) { + fclose(etherf); + etherf = NULL; + } + ethertype_stayopen = 0; +} + +struct ethertypeent *getethertypeent(void) +{ + char *e; + char *endptr; + register char *cp, **q; + + if (etherf == NULL + && (etherf = fopen(_PATH_ETHERTYPES, "r")) == NULL) { + return (NULL); + } + +again: + if ((e = fgets(line, BUFSIZ, etherf)) == NULL) { + return (NULL); + } + if (*e == '#') + goto again; + cp = strpbrk(e, "#\n"); + if (cp == NULL) + goto again; + *cp = '\0'; + et_ent.e_name = e; + cp = strpbrk(e, " \t"); + if (cp == NULL) + goto again; + *cp++ = '\0'; + while (*cp == ' ' || *cp == '\t') + cp++; + e = strpbrk(cp, " \t"); + if (e != NULL) + *e++ = '\0'; +// Check point + et_ent.e_ethertype = 
strtol(cp, &endptr, 16); + if (*endptr != '\0' + || (et_ent.e_ethertype < ETH_ZLEN + || et_ent.e_ethertype > 0xFFFF)) + goto again; // Skip invalid etherproto type entry + q = et_ent.e_aliases = ethertype_aliases; + if (e != NULL) { + cp = e; + while (cp && *cp) { + if (*cp == ' ' || *cp == '\t') { + cp++; + continue; + } + if (q < &ethertype_aliases[MAXALIASES - 1]) + *q++ = cp; + cp = strpbrk(cp, " \t"); + if (cp != NULL) + *cp++ = '\0'; + } + } + *q = NULL; + return (&et_ent); +} + + +struct ethertypeent *getethertypebyname(const char *name) +{ + register struct ethertypeent *e; + register char **cp; + + setethertypeent(ethertype_stayopen); + while ((e = getethertypeent()) != NULL) { + if (strcasecmp(e->e_name, name) == 0) + break; + for (cp = e->e_aliases; *cp != 0; cp++) + if (strcasecmp(*cp, name) == 0) + goto found; + } +found: + if (!ethertype_stayopen) + endethertypeent(); + return (e); +} + +struct ethertypeent *getethertypebynumber(int type) +{ + register struct ethertypeent *e; + + setethertypeent(ethertype_stayopen); + while ((e = getethertypeent()) != NULL) + if (e->e_ethertype == type) + break; + if (!ethertype_stayopen) + endethertypeent(); + return (e); +} diff --git a/tools/remus/imqebt/include/ebtables_u.h b/tools/remus/imqebt/include/ebtables_u.h new file mode 100644 --- /dev/null +++ b/tools/remus/imqebt/include/ebtables_u.h @@ -0,0 +1,379 @@ +/* + * $Id: ebtables.c,v 1.03 2002/01/19 + * + * Copyright (C) 2001-2002 Bart De Schuymer + * + * This code is stongly inspired on the iptables code which is + * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#ifndef EBTABLES_U_H +#define EBTABLES_U_H +#include <netinet/in.h> +#include <linux/netfilter_bridge/ebtables.h> +#include <linux/netfilter/x_tables.h> + +#ifndef IPPROTO_SCTP +#define IPPROTO_SCTP 132 +#endif +#ifndef IPPROTO_DCCP +#define IPPROTO_DCCP 33 +#endif + +#define EXEC_STYLE_PRG 0 +#define EXEC_STYLE_DAEMON 1 + +#ifndef EBT_MIN_ALIGN +#define EBT_MIN_ALIGN (__alignof__(struct _xt_align)) +#endif +#define EBT_ALIGN(s) (((s) + (EBT_MIN_ALIGN-1)) & ~(EBT_MIN_ALIGN-1)) +#define ERRORMSG_MAXLEN 128 + +struct ebt_u_entries +{ + int policy; + unsigned int nentries; + /* counter offset for this chain */ + unsigned int counter_offset; + /* used for udc */ + unsigned int hook_mask; + char *kernel_start; + char name[EBT_CHAIN_MAXNAMELEN]; + struct ebt_u_entry *entries; +}; + +struct ebt_cntchanges +{ + unsigned short type; + unsigned short change; /* determines incremental/decremental/change */ + struct ebt_cntchanges *prev; + struct ebt_cntchanges *next; +}; + +#define EBT_ORI_MAX_CHAINS 10 +struct ebt_u_replace +{ + char name[EBT_TABLE_MAXNAMELEN]; + unsigned int valid_hooks; + /* nr of rules in the table */ + unsigned int nentries; + unsigned int num_chains; + unsigned int max_chains; + struct ebt_u_entries **chains; + /* nr of counters userspace expects back */ + unsigned int num_counters; + /* where the kernel will put the old counters */ + struct ebt_counter *counters; + /* + * can be used e.g. 
to know if a standard option + * has been specified twice + */ + unsigned int flags; + /* we stick the specified command (e.g. -A) in here */ + char command; + /* + * here we stick the chain to do our thing on (can be -1 if unspecified) + */ + int selected_chain; + /* used for the atomic option */ + char *filename; + /* tells what happened to the old rules (counter changes) */ + struct ebt_cntchanges *cc; +}; + +struct ebt_u_table +{ + char name[EBT_TABLE_MAXNAMELEN]; + void (*check)(struct ebt_u_replace *repl); + void (*help)(const char **); + struct ebt_u_table *next; +}; + +struct ebt_u_match_list +{ + struct ebt_u_match_list *next; + struct ebt_entry_match *m; +}; + +struct ebt_u_watcher_list +{ + struct ebt_u_watcher_list *next; + struct ebt_entry_watcher *w; +}; + +struct ebt_u_entry +{ + unsigned int bitmask; + unsigned int invflags; + uint16_t ethproto; + char in[IFNAMSIZ]; + char logical_in[IFNAMSIZ]; + char out[IFNAMSIZ]; + char logical_out[IFNAMSIZ]; + unsigned char sourcemac[ETH_ALEN]; + unsigned char sourcemsk[ETH_ALEN]; + unsigned char destmac[ETH_ALEN]; + unsigned char destmsk[ETH_ALEN]; + struct ebt_u_match_list *m_list; + struct ebt_u_watcher_list *w_list; + struct ebt_entry_target *t; + struct ebt_u_entry *prev; + struct ebt_u_entry *next; + struct ebt_counter cnt; + struct ebt_counter cnt_surplus; /* for increasing/decreasing a counter and for option ''C'' */ + struct ebt_cntchanges *cc; + /* the standard target needs this to know the name of a udc when + * printing out rules. 
*/ + struct ebt_u_replace *replace; +}; + +struct ebt_u_match +{ + char name[EBT_FUNCTION_MAXNAMELEN]; + /* size of the real match data */ + unsigned int size; + void (*help)(void); + void (*init)(struct ebt_entry_match *m); + int (*parse)(int c, char **argv, int argc, + const struct ebt_u_entry *entry, unsigned int *flags, + struct ebt_entry_match **match); + void (*final_check)(const struct ebt_u_entry *entry, + const struct ebt_entry_match *match, + const char *name, unsigned int hookmask, unsigned int time); + void (*print)(const struct ebt_u_entry *entry, + const struct ebt_entry_match *match); + int (*compare)(const struct ebt_entry_match *m1, + const struct ebt_entry_match *m2); + const struct option *extra_ops; + /* + * can be used e.g. to check for multiple occurance of the same option + */ + unsigned int flags; + unsigned int option_offset; + struct ebt_entry_match *m; + /* + * if used == 1 we no longer have to add it to + * the match chain of the new entry + * be sure to put it back on 0 when finished + */ + unsigned int used; + struct ebt_u_match *next; +}; + +struct ebt_u_watcher +{ + char name[EBT_FUNCTION_MAXNAMELEN]; + unsigned int size; + void (*help)(void); + void (*init)(struct ebt_entry_watcher *w); + int (*parse)(int c, char **argv, int argc, + const struct ebt_u_entry *entry, unsigned int *flags, + struct ebt_entry_watcher **watcher); + void (*final_check)(const struct ebt_u_entry *entry, + const struct ebt_entry_watcher *watch, const char *name, + unsigned int hookmask, unsigned int time); + void (*print)(const struct ebt_u_entry *entry, + const struct ebt_entry_watcher *watcher); + int (*compare)(const struct ebt_entry_watcher *w1, + const struct ebt_entry_watcher *w2); + const struct option *extra_ops; + unsigned int flags; + unsigned int option_offset; + struct ebt_entry_watcher *w; + unsigned int used; + struct ebt_u_watcher *next; +}; + +struct ebt_u_target +{ + char name[EBT_FUNCTION_MAXNAMELEN]; + unsigned int size; + void 
(*help)(void); + void (*init)(struct ebt_entry_target *t); + int (*parse)(int c, char **argv, int argc, + const struct ebt_u_entry *entry, unsigned int *flags, + struct ebt_entry_target **target); + void (*final_check)(const struct ebt_u_entry *entry, + const struct ebt_entry_target *target, const char *name, + unsigned int hookmask, unsigned int time); + void (*print)(const struct ebt_u_entry *entry, + const struct ebt_entry_target *target); + int (*compare)(const struct ebt_entry_target *t1, + const struct ebt_entry_target *t2); + const struct option *extra_ops; + unsigned int option_offset; + unsigned int flags; + struct ebt_entry_target *t; + unsigned int used; + struct ebt_u_target *next; +}; + +/* libebtc.c */ + +extern struct ebt_u_table *ebt_tables; +extern struct ebt_u_match *ebt_matches; +extern struct ebt_u_watcher *ebt_watchers; +extern struct ebt_u_target *ebt_targets; + +void ebt_register_table(struct ebt_u_table *); +void ebt_register_match(struct ebt_u_match *); +void ebt_register_watcher(struct ebt_u_watcher *); +void ebt_register_target(struct ebt_u_target *t); +int ebt_get_kernel_table(struct ebt_u_replace *replace, int init); +struct ebt_u_target *ebt_find_target(const char *name); +struct ebt_u_match *ebt_find_match(const char *name); +struct ebt_u_watcher *ebt_find_watcher(const char *name); +struct ebt_u_table *ebt_find_table(const char *name); +int ebtables_insmod(const char *modname); +void ebt_list_extensions(void); +void ebt_initialize_entry(struct ebt_u_entry *e); +void ebt_cleanup_replace(struct ebt_u_replace *replace); +void ebt_reinit_extensions(void); +void ebt_double_chains(struct ebt_u_replace *replace); +void ebt_free_u_entry(struct ebt_u_entry *e); +struct ebt_u_entries *ebt_name_to_chain(const struct ebt_u_replace *replace, + const char* arg); +struct ebt_u_entries *ebt_name_to_chain(const struct ebt_u_replace *replace, + const char* arg); +int ebt_get_chainnr(const struct ebt_u_replace *replace, const char* arg); +/**/ +void 
ebt_change_policy(struct ebt_u_replace *replace, int policy); +void ebt_flush_chains(struct ebt_u_replace *replace); +int ebt_check_rule_exists(struct ebt_u_replace *replace, + struct ebt_u_entry *new_entry); +void ebt_add_rule(struct ebt_u_replace *replace, struct ebt_u_entry *new_entry, + int rule_nr); +void ebt_delete_rule(struct ebt_u_replace *replace, + struct ebt_u_entry *new_entry, int begin, int end); +void ebt_zero_counters(struct ebt_u_replace *replace); +void ebt_change_counters(struct ebt_u_replace *replace, + struct ebt_u_entry *new_entry, int begin, int end, + struct ebt_counter *cnt, int mask); +void ebt_new_chain(struct ebt_u_replace *replace, const char *name, int policy); +void ebt_delete_chain(struct ebt_u_replace *replace); +void ebt_rename_chain(struct ebt_u_replace *replace, const char *name); +/**/ +void ebt_do_final_checks(struct ebt_u_replace *replace, struct ebt_u_entry *e, + struct ebt_u_entries *entries); +int ebt_check_for_references(struct ebt_u_replace *replace, int print_err); +int ebt_check_for_references2(struct ebt_u_replace *replace, int chain_nr, + int print_err); +void ebt_check_for_loops(struct ebt_u_replace *replace); +void ebt_add_match(struct ebt_u_entry *new_entry, struct ebt_u_match *m); +void ebt_add_watcher(struct ebt_u_entry *new_entry, struct ebt_u_watcher *w); +void ebt_iterate_matches(void (*f)(struct ebt_u_match *)); +void ebt_iterate_watchers(void (*f)(struct ebt_u_watcher *)); +void ebt_iterate_targets(void (*f)(struct ebt_u_target *)); +void __ebt_print_bug(char *file, int line, char *format, ...); +void __ebt_print_error(char *format, ...); + +/* communication.c */ + +int ebt_get_table(struct ebt_u_replace *repl, int init); +void ebt_deliver_counters(struct ebt_u_replace *repl); +void ebt_deliver_table(struct ebt_u_replace *repl); + +/* useful_functions.c */ + +extern int ebt_invert; +void ebt_check_option(unsigned int *flags, unsigned int mask); +#define ebt_check_inverse(arg) _ebt_check_inverse(arg, argc, 
argv) +int _ebt_check_inverse(const char option[], int argc, char **argv); +void ebt_print_mac(const unsigned char *mac); +void ebt_print_mac_and_mask(const unsigned char *mac, const unsigned char *mask); +int ebt_get_mac_and_mask(const char *from, unsigned char *to, unsigned char *mask); +void ebt_parse_ip_address(char *address, uint32_t *addr, uint32_t *msk); +char *ebt_mask_to_dotted(uint32_t mask); +void ebt_parse_ip6_address(char *address, struct in6_addr *addr, + struct in6_addr *msk); +char *ebt_ip6_to_numeric(const struct in6_addr *addrp); + + +int do_command(int argc, char *argv[], int exec_style, + struct ebt_u_replace *replace_); + +struct ethertypeent *parseethertypebynumber(int type); + +#define ebt_to_chain(repl) \ +({struct ebt_u_entries *_ch = NULL; \ +if (repl->selected_chain != -1) \ + _ch = repl->chains[repl->selected_chain]; \ +_ch;}) +#define ebt_print_bug(format, args...) \ + __ebt_print_bug(__FILE__, __LINE__, format, ##args) +#define ebt_print_error(format,args...) __ebt_print_error(format, ##args); +#define ebt_print_error2(format, args...) 
do {__ebt_print_error(format, ##args); \ + return -1;} while (0) +#define ebt_check_option2(flags,mask) \ +({ebt_check_option(flags,mask); \ + if (ebt_errormsg[0] != '\0') \ + return -1;}) +#define ebt_check_inverse2(option) \ +({int __ret = ebt_check_inverse(option); \ +if (ebt_errormsg[0] != '\0') \ + return -1; \ +if (!optarg) { \ + __ebt_print_error("Option without (mandatory) argument"); \ + return -1; \ +} \ +__ret;}) +#define ebt_print_memory() do {printf("Ebtables: " __FILE__ \ + " %s %d :Out of memory.\n", __FUNCTION__, __LINE__); exit(-1);} while (0) + +/* used for keeping the rule counters right during rule adds or deletes */ +#define CNT_NORM 0 +#define CNT_DEL 1 +#define CNT_ADD 2 +#define CNT_CHANGE 3 + +extern const char *ebt_hooknames[NF_BR_NUMHOOKS]; +extern const char *ebt_standard_targets[NUM_STANDARD_TARGETS]; +extern char ebt_errormsg[ERRORMSG_MAXLEN]; +extern char *ebt_modprobe; +extern int ebt_silent; +extern int ebt_printstyle_mac; + +/* + * Transforms a target string into the right integer, + * returns 0 on success. 
+ */ +#define FILL_TARGET(_str, _pos) ({ \ + int _i, _ret = 0; \ + for (_i = 0; _i < NUM_STANDARD_TARGETS; _i++) \ + if (!strcmp(_str, ebt_standard_targets[_i])) {\ + _pos = -_i - 1; \ + break; \ + } \ + if (_i == NUM_STANDARD_TARGETS) \ + _ret = 1; \ + _ret; \ +}) + +/* Transforms the target value to an index into standard_targets[] */ +#define TARGET_INDEX(_value) (-_value - 1) +/* Returns a target string corresponding to the value */ +#define TARGET_NAME(_value) (ebt_standard_targets[TARGET_INDEX(_value)]) +/* True if the hook mask denotes that the rule is in a base chain */ +#define BASE_CHAIN (hookmask & (1 << NF_BR_NUMHOOKS)) +/* Clear the bit in the hook_mask that tells if the rule is on a base chain */ +#define CLEAR_BASE_CHAIN_BIT (hookmask &= ~(1 << NF_BR_NUMHOOKS)) +#define PRINT_VERSION printf(PROGNAME" v"PROGVERSION" ("PROGDATE")\n") +#ifndef PROC_SYS_MODPROBE +#define PROC_SYS_MODPROBE "/proc/sys/kernel/modprobe" +#endif +#define ATOMIC_ENV_VARIABLE "EBTABLES_ATOMIC_FILE" +#endif /* EBTABLES_U_H */ diff --git a/tools/remus/imqebt/include/ethernetdb.h b/tools/remus/imqebt/include/ethernetdb.h new file mode 100644 --- /dev/null +++ b/tools/remus/imqebt/include/ethernetdb.h @@ -0,0 +1,58 @@ +/* +* This program is free software; you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation; either version 2 of the License, or +* (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. 
+* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software +* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +/* All data returned by the network data base library are supplied in + host order and returned in network order (suitable for use in + system calls). */ + +#ifndef _ETHERNETDB_H +#define _ETHERNETDB_H 1 + +#include <features.h> +#include <netinet/in.h> +#include <stdint.h> + +/* Absolute file name for network data base files. */ +#ifndef _PATH_ETHERTYPES +#define _PATH_ETHERTYPES "/etc/ethertypes" +#endif /* _PATH_ETHERTYPES */ + +struct ethertypeent { + char *e_name; /* Official ethernet type name. */ + char **e_aliases; /* Alias list. */ + int e_ethertype; /* Ethernet type number. */ +}; + +/* Open ethertype data base files and mark them as staying open even + after a later search if STAY_OPEN is non-zero. */ +extern void setethertypeent(int __stay_open) __THROW; + +/* Close ethertype data base files and clear `stay open'' flag. */ +extern void endethertypeent(void) __THROW; + +/* Get next entry from ethertype data base file. Open data base if + necessary. */ +extern struct ethertypeent *getethertypeent(void) __THROW; + +/* Return entry from ethertype data base for network with NAME. */ +extern struct ethertypeent *getethertypebyname(__const char *__name) + __THROW; + +/* Return entry from ethertype data base which number is PROTO. */ +extern struct ethertypeent *getethertypebynumber(int __ethertype) __THROW; + + +#endif /* ethernetdb.h */ diff --git a/tools/remus/imqebt/include/linux/if_ether.h b/tools/remus/imqebt/include/linux/if_ether.h new file mode 100644 --- /dev/null +++ b/tools/remus/imqebt/include/linux/if_ether.h @@ -0,0 +1,146 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. 
+ * + * Global definitions for the Ethernet IEEE 802.3 interface. + * + * Version: @(#)if_ether.h 1.0.1a 02/08/94 + * + * Author: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> + * Donald Becker, <becker@super.org> + * Alan Cox, <alan@lxorguk.ukuu.org.uk> + * Steve Whitehouse, <gw7rrm@eeshack3.swan.ac.uk> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _LINUX_IF_ETHER_H +#define _LINUX_IF_ETHER_H + +#include <linux/types.h> + +/* + * IEEE 802.3 Ethernet magic constants. The frame sizes omit the preamble + * and FCS/CRC (frame check sequence). + */ + +#define ETH_ALEN 6 /* Octets in one ethernet addr */ +#define ETH_HLEN 14 /* Total octets in header. */ +#define ETH_ZLEN 60 /* Min. octets in frame sans FCS */ +#define ETH_DATA_LEN 1500 /* Max. octets in payload */ +#define ETH_FRAME_LEN 1514 /* Max. octets in frame sans FCS */ +#define ETH_FCS_LEN 4 /* Octets in the FCS */ + +/* + * These are the defined Ethernet Protocol ID''s. 
+ */ + +#define ETH_P_LOOP 0x0060 /* Ethernet Loopback packet */ +#define ETH_P_PUP 0x0200 /* Xerox PUP packet */ +#define ETH_P_PUPAT 0x0201 /* Xerox PUP Addr Trans packet */ +#define ETH_P_IP 0x0800 /* Internet Protocol packet */ +#define ETH_P_X25 0x0805 /* CCITT X.25 */ +#define ETH_P_ARP 0x0806 /* Address Resolution packet */ +#define ETH_P_BPQ 0x08FF /* G8BPQ AX.25 Ethernet Packet [ NOT AN OFFICIALLY REGISTERED ID ] */ +#define ETH_P_IEEEPUP 0x0a00 /* Xerox IEEE802.3 PUP packet */ +#define ETH_P_IEEEPUPAT 0x0a01 /* Xerox IEEE802.3 PUP Addr Trans packet */ +#define ETH_P_DEC 0x6000 /* DEC Assigned proto */ +#define ETH_P_DNA_DL 0x6001 /* DEC DNA Dump/Load */ +#define ETH_P_DNA_RC 0x6002 /* DEC DNA Remote Console */ +#define ETH_P_DNA_RT 0x6003 /* DEC DNA Routing */ +#define ETH_P_LAT 0x6004 /* DEC LAT */ +#define ETH_P_DIAG 0x6005 /* DEC Diagnostics */ +#define ETH_P_CUST 0x6006 /* DEC Customer use */ +#define ETH_P_SCA 0x6007 /* DEC Systems Comms Arch */ +#define ETH_P_TEB 0x6558 /* Trans Ether Bridging */ +#define ETH_P_RARP 0x8035 /* Reverse Addr Res packet */ +#define ETH_P_ATALK 0x809B /* Appletalk DDP */ +#define ETH_P_AARP 0x80F3 /* Appletalk AARP */ +#define ETH_P_8021Q 0x8100 /* 802.1Q VLAN Extended Header */ +#define ETH_P_IPX 0x8137 /* IPX over DIX */ +#define ETH_P_IPV6 0x86DD /* IPv6 over bluebook */ +#define ETH_P_PAUSE 0x8808 /* IEEE Pause frames. See 802.3 31B */ +#define ETH_P_SLOW 0x8809 /* Slow Protocol. 
See 802.3ad 43B */ +#define ETH_P_WCCP 0x883E /* Web-cache coordination protocol + * defined in draft-wilson-wrec-wccp-v2-00.txt */ +#define ETH_P_PPP_DISC 0x8863 /* PPPoE discovery messages */ +#define ETH_P_PPP_SES 0x8864 /* PPPoE session messages */ +#define ETH_P_MPLS_UC 0x8847 /* MPLS Unicast traffic */ +#define ETH_P_MPLS_MC 0x8848 /* MPLS Multicast traffic */ +#define ETH_P_ATMMPOA 0x884c /* MultiProtocol Over ATM */ +#define ETH_P_ATMFATE 0x8884 /* Frame-based ATM Transport + * over Ethernet + */ +#define ETH_P_PAE 0x888E /* Port Access Entity (IEEE 802.1X) */ +#define ETH_P_AOE 0x88A2 /* ATA over Ethernet */ +#define ETH_P_TIPC 0x88CA /* TIPC */ +#define ETH_P_FCOE 0x8906 /* Fibre Channel over Ethernet */ +#define ETH_P_EDSA 0xDADA /* Ethertype DSA [ NOT AN OFFICIALLY REGISTERED ID ] */ + +/* + * Non DIX types. Won''t clash for 1500 types. + */ + +#define ETH_P_802_3 0x0001 /* Dummy type for 802.3 frames */ +#define ETH_P_AX25 0x0002 /* Dummy protocol id for AX.25 */ +#define ETH_P_ALL 0x0003 /* Every packet (be careful!!!) */ +#define ETH_P_802_2 0x0004 /* 802.2 frames */ +#define ETH_P_SNAP 0x0005 /* Internal only */ +#define ETH_P_DDCMP 0x0006 /* DEC DDCMP: Internal only */ +#define ETH_P_WAN_PPP 0x0007 /* Dummy type for WAN PPP frames*/ +#define ETH_P_PPP_MP 0x0008 /* Dummy type for PPP MP frames */ +#define ETH_P_LOCALTALK 0x0009 /* Localtalk pseudo type */ +#define ETH_P_CAN 0x000C /* Controller Area Network */ +#define ETH_P_PPPTALK 0x0010 /* Dummy type for Atalk over PPP*/ +#define ETH_P_TR_802_2 0x0011 /* 802.2 frames */ +#define ETH_P_MOBITEX 0x0015 /* Mobitex (kaz@cafe.net) */ +#define ETH_P_CONTROL 0x0016 /* Card specific control frames */ +#define ETH_P_IRDA 0x0017 /* Linux-IrDA */ +#define ETH_P_ECONET 0x0018 /* Acorn Econet */ +#define ETH_P_HDLC 0x0019 /* HDLC frames */ +#define ETH_P_ARCNET 0x001A /* 1A for ArcNet :-) */ +#define ETH_P_DSA 0x001B /* Distributed Switch Arch. 
*/ +#define ETH_P_TRAILER 0x001C /* Trailer switch tagging */ +#define ETH_P_PHONET 0x00F5 /* Nokia Phonet frames */ + +/* + * This is an Ethernet frame header. + */ + +struct ethhdr { + unsigned char h_dest[ETH_ALEN]; /* destination eth addr */ + unsigned char h_source[ETH_ALEN]; /* source ether addr */ + __be16 h_proto; /* packet type ID field */ +} __attribute__((packed)); + +#ifdef __KERNEL__ +#include <linux/skbuff.h> + +static inline struct ethhdr *eth_hdr(const struct sk_buff *skb) +{ + return (struct ethhdr *)skb_mac_header(skb); +} + +int eth_header_parse(const struct sk_buff *skb, unsigned char *haddr); + +#ifdef CONFIG_SYSCTL +extern struct ctl_table ether_table[]; +#endif + +extern ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len); + +/* + * Display a 6 byte device address (MAC) in a readable format. + */ +extern char *print_mac(char *buf, const unsigned char *addr); +#define MAC_FMT "%02x:%02x:%02x:%02x:%02x:%02x" +#define MAC_BUF_SIZE 18 +#define DECLARE_MAC_BUF(var) char var[MAC_BUF_SIZE] __maybe_unused + +#endif + +#endif /* _LINUX_IF_ETHER_H */ diff --git a/tools/remus/imqebt/include/linux/netfilter_bridge.h b/tools/remus/imqebt/include/linux/netfilter_bridge.h new file mode 100644 --- /dev/null +++ b/tools/remus/imqebt/include/linux/netfilter_bridge.h @@ -0,0 +1,91 @@ +#ifndef __LINUX_BRIDGE_NETFILTER_H +#define __LINUX_BRIDGE_NETFILTER_H + +/* bridge-specific defines for netfilter. + */ + +/* Bridge Hooks */ +/* After promisc drops, checksum checks. */ +#define NF_BR_PRE_ROUTING 0 +/* If the packet is destined for this box. */ +#define NF_BR_LOCAL_IN 1 +/* If the packet is destined for another interface. */ +#define NF_BR_FORWARD 2 +/* Packets coming from a local process. */ +#define NF_BR_LOCAL_OUT 3 +/* Packets about to hit the wire. 
*/ +#define NF_BR_POST_ROUTING 4 +/* Not really a hook, but used for the ebtables broute table */ +#define NF_BR_BROUTING 5 +#define NF_BR_NUMHOOKS 6 + +#ifdef __KERNEL__ +#include <linux/netfilter.h> +#include <linux/if_ether.h> +#include <linux/if_vlan.h> +#include <linux/if_pppox.h> + +enum nf_br_hook_priorities { + NF_BR_PRI_FIRST = INT_MIN, + NF_BR_PRI_NAT_DST_BRIDGED = -300, + NF_BR_PRI_FILTER_BRIDGED = -200, + NF_BR_PRI_BRNF = 0, + NF_BR_PRI_NAT_DST_OTHER = 100, + NF_BR_PRI_FILTER_OTHER = 200, + NF_BR_PRI_NAT_SRC = 300, + NF_BR_PRI_LAST = INT_MAX, +}; + +#ifdef CONFIG_BRIDGE_NETFILTER + +#define BRNF_PKT_TYPE 0x01 +#define BRNF_BRIDGED_DNAT 0x02 +#define BRNF_DONT_TAKE_PARENT 0x04 +#define BRNF_BRIDGED 0x08 +#define BRNF_NF_BRIDGE_PREROUTING 0x10 + + +/* Only used in br_forward.c */ +extern int nf_bridge_copy_header(struct sk_buff *skb); +static inline int nf_bridge_maybe_copy_header(struct sk_buff *skb) +{ + if (skb->nf_bridge && + skb->nf_bridge->mask & (BRNF_BRIDGED | BRNF_BRIDGED_DNAT)) + return nf_bridge_copy_header(skb); + return 0; +} + +static inline unsigned int nf_bridge_encap_header_len(const struct sk_buff *skb) +{ + switch (skb->protocol) { + case __cpu_to_be16(ETH_P_8021Q): + return VLAN_HLEN; + case __cpu_to_be16(ETH_P_PPP_SES): + return PPPOE_SES_HLEN; + default: + return 0; + } +} + +/* This is called by the IP fragmenting code and it ensures there is + * enough room for the encapsulating header (if there is one). 
*/ +static inline unsigned int nf_bridge_pad(const struct sk_buff *skb) +{ + if (skb->nf_bridge) + return nf_bridge_encap_header_len(skb); + return 0; +} + +struct bridge_skb_cb { + union { + __be32 ipv4; + } daddr; +}; + +#else +#define nf_bridge_maybe_copy_header(skb) (0) +#define nf_bridge_pad(skb) (0) +#endif /* CONFIG_BRIDGE_NETFILTER */ + +#endif /* __KERNEL__ */ +#endif diff --git a/tools/remus/imqebt/include/linux/netfilter_bridge/ebt_imq.h b/tools/remus/imqebt/include/linux/netfilter_bridge/ebt_imq.h new file mode 100644 --- /dev/null +++ b/tools/remus/imqebt/include/linux/netfilter_bridge/ebt_imq.h @@ -0,0 +1,8 @@ +#ifndef __LINUX_BRIDGE_EBT_IMQ_H +#define __LINUX_BRIDGE_EBT_IMQ_H + +struct ebt_imq_info +{ + unsigned int todev; +}; +#endif diff --git a/tools/remus/imqebt/include/linux/netfilter_bridge/ebtables.h b/tools/remus/imqebt/include/linux/netfilter_bridge/ebtables.h new file mode 100644 --- /dev/null +++ b/tools/remus/imqebt/include/linux/netfilter_bridge/ebtables.h @@ -0,0 +1,276 @@ +/* + * ebtables + * + * Authors: + * Bart De Schuymer <bdschuym@pandora.be> + * + * ebtables.c,v 2.0, April, 2002 + * + * This code is stongly inspired on the iptables code which is + * Copyright (C) 1999 Paul `Rusty'' Russell & Michael J. Neuling + */ + +/* Local copy of the kernel file, needed for Sparc64 support */ +#ifndef __LINUX_BRIDGE_EFF_H +#define __LINUX_BRIDGE_EFF_H +#include <linux/if.h> +#include <linux/netfilter_bridge.h> +#include <linux/if_ether.h> + +#define EBT_TABLE_MAXNAMELEN 32 +#define EBT_CHAIN_MAXNAMELEN EBT_TABLE_MAXNAMELEN +#define EBT_FUNCTION_MAXNAMELEN EBT_TABLE_MAXNAMELEN + +/* verdicts >0 are "branches" */ +#define EBT_ACCEPT -1 +#define EBT_DROP -2 +#define EBT_CONTINUE -3 +#define EBT_RETURN -4 +#define NUM_STANDARD_TARGETS 4 +/* ebtables target modules store the verdict inside an int. We can + * reclaim a part of this int for backwards compatible extensions. + * The 4 lsb are more than enough to store the verdict. 
*/ +#define EBT_VERDICT_BITS 0x0000000F + +struct ebt_counter +{ + uint64_t pcnt; + uint64_t bcnt; +}; + +struct ebt_replace +{ + char name[EBT_TABLE_MAXNAMELEN]; + unsigned int valid_hooks; + /* nr of rules in the table */ + unsigned int nentries; + /* total size of the entries */ + unsigned int entries_size; + /* start of the chains */ +#ifdef KERNEL_64_USERSPACE_32 + uint64_t hook_entry[NF_BR_NUMHOOKS]; +#else + struct ebt_entries *hook_entry[NF_BR_NUMHOOKS]; +#endif + /* nr of counters userspace expects back */ + unsigned int num_counters; + /* where the kernel will put the old counters */ +#ifdef KERNEL_64_USERSPACE_32 + uint64_t counters; + uint64_t entries; +#else + struct ebt_counter *counters; + char *entries; +#endif +}; + +struct ebt_entries { + /* this field is always set to zero + * See EBT_ENTRY_OR_ENTRIES. + * Must be same size as ebt_entry.bitmask */ + unsigned int distinguisher; + /* the chain name */ + char name[EBT_CHAIN_MAXNAMELEN]; + /* counter offset for this chain */ + unsigned int counter_offset; + /* one standard (accept, drop, return) per hook */ + int policy; + /* nr. of entries */ + unsigned int nentries; + /* entry list */ + char data[0] __attribute__ ((aligned (__alignof__(struct ebt_replace)))); +}; + +/* used for the bitmask of struct ebt_entry */ + +/* This is a hack to make a difference between an ebt_entry struct and an + * ebt_entries struct when traversing the entries from start to end. + * Using this simplifies the code alot, while still being able to use + * ebt_entries. + * Contrary, iptables doesn''t use something like ebt_entries and therefore uses + * different techniques for naming the policy and such. So, iptables doesn''t + * need a hack like this. 
+ */ +#define EBT_ENTRY_OR_ENTRIES 0x01 +/* these are the normal masks */ +#define EBT_NOPROTO 0x02 +#define EBT_802_3 0x04 +#define EBT_SOURCEMAC 0x08 +#define EBT_DESTMAC 0x10 +#define EBT_F_MASK (EBT_NOPROTO | EBT_802_3 | EBT_SOURCEMAC | EBT_DESTMAC \ + | EBT_ENTRY_OR_ENTRIES) + +#define EBT_IPROTO 0x01 +#define EBT_IIN 0x02 +#define EBT_IOUT 0x04 +#define EBT_ISOURCE 0x8 +#define EBT_IDEST 0x10 +#define EBT_ILOGICALIN 0x20 +#define EBT_ILOGICALOUT 0x40 +#define EBT_INV_MASK (EBT_IPROTO | EBT_IIN | EBT_IOUT | EBT_ILOGICALIN \ + | EBT_ILOGICALOUT | EBT_ISOURCE | EBT_IDEST) + +struct ebt_entry_match +{ + union { + char name[EBT_FUNCTION_MAXNAMELEN]; + struct ebt_match *match; + } u; + /* size of data */ + unsigned int match_size; +#ifdef KERNEL_64_USERSPACE_32 + unsigned int pad; +#endif + unsigned char data[0] __attribute__ ((aligned (__alignof__(struct ebt_replace)))); +}; + +struct ebt_entry_watcher +{ + union { + char name[EBT_FUNCTION_MAXNAMELEN]; + struct ebt_watcher *watcher; + } u; + /* size of data */ + unsigned int watcher_size; +#ifdef KERNEL_64_USERSPACE_32 + unsigned int pad; +#endif + unsigned char data[0] __attribute__ ((aligned (__alignof__(struct ebt_replace)))); +}; + +struct ebt_entry_target +{ + union { + char name[EBT_FUNCTION_MAXNAMELEN]; + struct ebt_target *target; + } u; + /* size of data */ + unsigned int target_size; +#ifdef KERNEL_64_USERSPACE_32 + unsigned int pad; +#endif + unsigned char data[0] __attribute__ ((aligned (__alignof__(struct ebt_replace)))); +}; + +#define EBT_STANDARD_TARGET "standard" +struct ebt_standard_target +{ + struct ebt_entry_target target; + int verdict; +#ifdef KERNEL_64_USERSPACE_32 + unsigned int pad; +#endif +}; + +/* one entry */ +struct ebt_entry { + /* this needs to be the first field */ + unsigned int bitmask; + unsigned int invflags; + uint16_t ethproto; + /* the physical in-dev */ + char in[IFNAMSIZ]; + /* the logical in-dev */ + char logical_in[IFNAMSIZ]; + /* the physical out-dev */ + char 
out[IFNAMSIZ]; + /* the logical out-dev */ + char logical_out[IFNAMSIZ]; + unsigned char sourcemac[ETH_ALEN]; + unsigned char sourcemsk[ETH_ALEN]; + unsigned char destmac[ETH_ALEN]; + unsigned char destmsk[ETH_ALEN]; + /* sizeof ebt_entry + matches */ + unsigned int watchers_offset; + /* sizeof ebt_entry + matches + watchers */ + unsigned int target_offset; + /* sizeof ebt_entry + matches + watchers + target */ + unsigned int next_offset; + unsigned char elems[0] __attribute__ ((aligned (__alignof__(struct ebt_replace)))); +}; + +/* {g,s}etsockopt numbers */ +#define EBT_BASE_CTL 128 + +#define EBT_SO_SET_ENTRIES (EBT_BASE_CTL) +#define EBT_SO_SET_COUNTERS (EBT_SO_SET_ENTRIES+1) +#define EBT_SO_SET_MAX (EBT_SO_SET_COUNTERS+1) + +#define EBT_SO_GET_INFO (EBT_BASE_CTL) +#define EBT_SO_GET_ENTRIES (EBT_SO_GET_INFO+1) +#define EBT_SO_GET_INIT_INFO (EBT_SO_GET_ENTRIES+1) +#define EBT_SO_GET_INIT_ENTRIES (EBT_SO_GET_INIT_INFO+1) +#define EBT_SO_GET_MAX (EBT_SO_GET_INIT_ENTRIES+1) + +/* blatently stolen from ip_tables.h + * fn returns 0 to continue iteration */ +#define EBT_MATCH_ITERATE(e, fn, args...) \ +({ \ + unsigned int __i; \ + int __ret = 0; \ + struct ebt_entry_match *__match; \ + \ + for (__i = sizeof(struct ebt_entry); \ + __i < (e)->watchers_offset; \ + __i += __match->match_size + \ + sizeof(struct ebt_entry_match)) { \ + __match = (void *)(e) + __i; \ + \ + __ret = fn(__match , ## args); \ + if (__ret != 0) \ + break; \ + } \ + if (__ret == 0) { \ + if (__i != (e)->watchers_offset) \ + __ret = -EINVAL; \ + } \ + __ret; \ +}) + +#define EBT_WATCHER_ITERATE(e, fn, args...) 
\ +({ \ + unsigned int __i; \ + int __ret = 0; \ + struct ebt_entry_watcher *__watcher; \ + \ + for (__i = e->watchers_offset; \ + __i < (e)->target_offset; \ + __i += __watcher->watcher_size + \ + sizeof(struct ebt_entry_watcher)) { \ + __watcher = (void *)(e) + __i; \ + \ + __ret = fn(__watcher , ## args); \ + if (__ret != 0) \ + break; \ + } \ + if (__ret == 0) { \ + if (__i != (e)->target_offset) \ + __ret = -EINVAL; \ + } \ + __ret; \ +}) + +#define EBT_ENTRY_ITERATE(entries, size, fn, args...) \ +({ \ + unsigned int __i; \ + int __ret = 0; \ + struct ebt_entry *__entry; \ + \ + for (__i = 0; __i < (size);) { \ + __entry = (void *)(entries) + __i; \ + __ret = fn(__entry , ## args); \ + if (__ret != 0) \ + break; \ + if (__entry->bitmask != 0) \ + __i += __entry->next_offset; \ + else \ + __i += sizeof(struct ebt_entries); \ + } \ + if (__ret == 0) { \ + if (__i != (size)) \ + __ret = -EINVAL; \ + } \ + __ret; \ +}) + +#endif diff --git a/tools/remus/imqebt/include/linux/types.h b/tools/remus/imqebt/include/linux/types.h new file mode 100644 --- /dev/null +++ b/tools/remus/imqebt/include/linux/types.h @@ -0,0 +1,209 @@ +#ifndef _LINUX_TYPES_H +#define _LINUX_TYPES_H + +#include <asm/types.h> + +#ifndef __ASSEMBLY__ +#ifdef __KERNEL__ + +#define DECLARE_BITMAP(name,bits) \ + unsigned long name[BITS_TO_LONGS(bits)] + +#endif + +#include <linux/posix_types.h> + +#ifdef __KERNEL__ + +typedef __u32 __kernel_dev_t; + +typedef __kernel_fd_set fd_set; +typedef __kernel_dev_t dev_t; +typedef __kernel_ino_t ino_t; +typedef __kernel_mode_t mode_t; +typedef __kernel_nlink_t nlink_t; +typedef __kernel_off_t off_t; +typedef __kernel_pid_t pid_t; +typedef __kernel_daddr_t daddr_t; +typedef __kernel_key_t key_t; +typedef __kernel_suseconds_t suseconds_t; +typedef __kernel_timer_t timer_t; +typedef __kernel_clockid_t clockid_t; +typedef __kernel_mqd_t mqd_t; + +typedef _Bool bool; + +typedef __kernel_uid32_t uid_t; +typedef __kernel_gid32_t gid_t; +typedef __kernel_uid16_t 
uid16_t; +typedef __kernel_gid16_t gid16_t; + +typedef unsigned long uintptr_t; + +#ifdef CONFIG_UID16 +/* This is defined by include/asm-{arch}/posix_types.h */ +typedef __kernel_old_uid_t old_uid_t; +typedef __kernel_old_gid_t old_gid_t; +#endif /* CONFIG_UID16 */ + +#if defined(__GNUC__) +typedef __kernel_loff_t loff_t; +#endif + +/* + * The following typedefs are also protected by individual ifdefs for + * historical reasons: + */ +#ifndef _SIZE_T +#define _SIZE_T +typedef __kernel_size_t size_t; +#endif + +#ifndef _SSIZE_T +#define _SSIZE_T +typedef __kernel_ssize_t ssize_t; +#endif + +#ifndef _PTRDIFF_T +#define _PTRDIFF_T +typedef __kernel_ptrdiff_t ptrdiff_t; +#endif + +#ifndef _TIME_T +#define _TIME_T +typedef __kernel_time_t time_t; +#endif + +#ifndef _CLOCK_T +#define _CLOCK_T +typedef __kernel_clock_t clock_t; +#endif + +#ifndef _CADDR_T +#define _CADDR_T +typedef __kernel_caddr_t caddr_t; +#endif + +/* bsd */ +typedef unsigned char u_char; +typedef unsigned short u_short; +typedef unsigned int u_int; +typedef unsigned long u_long; + +/* sysv */ +typedef unsigned char unchar; +typedef unsigned short ushort; +typedef unsigned int uint; +typedef unsigned long ulong; + +#ifndef __BIT_TYPES_DEFINED__ +#define __BIT_TYPES_DEFINED__ + +typedef __u8 u_int8_t; +typedef __s8 int8_t; +typedef __u16 u_int16_t; +typedef __s16 int16_t; +typedef __u32 u_int32_t; +typedef __s32 int32_t; + +#endif /* !(__BIT_TYPES_DEFINED__) */ + +typedef __u8 uint8_t; +typedef __u16 uint16_t; +typedef __u32 uint32_t; + +#if defined(__GNUC__) +typedef __u64 uint64_t; +typedef __u64 u_int64_t; +typedef __s64 int64_t; +#endif + +/* this is a special 64bit data type that is 8-byte aligned */ +#define aligned_u64 __u64 __attribute__((aligned(8))) +#define aligned_be64 __be64 __attribute__((aligned(8))) +#define aligned_le64 __le64 __attribute__((aligned(8))) + +/** + * The type used for indexing onto a disc or disc partition. 
+ * + * Linux always considers sectors to be 512 bytes long independently + * of the devices real block size. + * + * blkcnt_t is the type of the inode''s block count. + */ +#ifdef CONFIG_LBD +typedef u64 sector_t; +typedef u64 blkcnt_t; +#else +typedef unsigned long sector_t; +typedef unsigned long blkcnt_t; +#endif + +/* + * The type of an index into the pagecache. Use a #define so asm/types.h + * can override it. + */ +#ifndef pgoff_t +#define pgoff_t unsigned long +#endif + +#endif /* __KERNEL__ */ + +/* + * Below are truly Linux-specific types that should never collide with + * any application/library that wants linux/types.h. + */ + +#ifdef __CHECKER__ +#define __bitwise__ __attribute__((bitwise)) +#else +#define __bitwise__ +#endif +#ifdef __CHECK_ENDIAN__ +#define __bitwise __bitwise__ +#else +#define __bitwise +#endif + +typedef __u16 __bitwise __le16; +typedef __u16 __bitwise __be16; +typedef __u32 __bitwise __le32; +typedef __u32 __bitwise __be32; +typedef __u64 __bitwise __le64; +typedef __u64 __bitwise __be64; + +typedef __u16 __bitwise __sum16; +typedef __u32 __bitwise __wsum; + +#ifdef __KERNEL__ +typedef unsigned __bitwise__ gfp_t; +typedef unsigned __bitwise__ fmode_t; + +#ifdef CONFIG_PHYS_ADDR_T_64BIT +typedef u64 phys_addr_t; +#else +typedef u32 phys_addr_t; +#endif + +typedef phys_addr_t resource_size_t; + +typedef struct { + volatile int counter; +} atomic_t; + +#ifdef CONFIG_64BIT +typedef struct { + volatile long counter; +} atomic64_t; +#endif + +struct ustat { + __kernel_daddr_t f_tfree; + __kernel_ino_t f_tinode; + char f_fname[6]; + char f_fpack[6]; +}; + +#endif /* __KERNEL__ */ +#endif /* __ASSEMBLY__ */ +#endif /* _LINUX_TYPES_H */ diff --git a/tools/remus/imqebt/libebtc.c b/tools/remus/imqebt/libebtc.c new file mode 100644 --- /dev/null +++ b/tools/remus/imqebt/libebtc.c @@ -0,0 +1,1280 @@ +/* + * libebtc.c, January 2004 + * + * Contains the functions with which to make a table in userspace. 
+ * + * Author: Bart De Schuymer + * + * This code is stongly inspired on the iptables code which is + * Copyright (C) 1999 Paul `Rusty'' Russell & Michael J. Neuling + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include <string.h> +#include <stdio.h> +#include <stdlib.h> +#include <stdarg.h> +#include "include/ebtables_u.h" +#include "include/ethernetdb.h" +#include <unistd.h> +#include <fcntl.h> +#include <sys/wait.h> + +static void decrease_chain_jumps(struct ebt_u_replace *replace); +static int iterate_entries(struct ebt_u_replace *replace, int type); + +/* The standard names */ +const char *ebt_hooknames[NF_BR_NUMHOOKS] +{ + [NF_BR_PRE_ROUTING]"PREROUTING", + [NF_BR_LOCAL_IN]"INPUT", + [NF_BR_FORWARD]"FORWARD", + [NF_BR_LOCAL_OUT]"OUTPUT", + [NF_BR_POST_ROUTING]"POSTROUTING", + [NF_BR_BROUTING]"BROUTING" +}; + +/* The four target names */ +const char* ebt_standard_targets[NUM_STANDARD_TARGETS] +{ + "ACCEPT", + "DROP", + "CONTINUE", + "RETURN", +}; + +/* The lists of supported tables, matches, watchers and targets */ +struct ebt_u_table *ebt_tables; +struct ebt_u_match *ebt_matches; +struct ebt_u_watcher *ebt_watchers; +struct ebt_u_target *ebt_targets; + +/* Find the right structure belonging to a name */ +struct ebt_u_target *ebt_find_target(const char *name) +{ + struct ebt_u_target *t = 
ebt_targets; + + while (t && strcmp(t->name, name)) + t = t->next; + return t; +} + +struct ebt_u_match *ebt_find_match(const char *name) +{ + struct ebt_u_match *m = ebt_matches; + + while (m && strcmp(m->name, name)) + m = m->next; + return m; +} + +struct ebt_u_watcher *ebt_find_watcher(const char *name) +{ + struct ebt_u_watcher *w = ebt_watchers; + + while (w && strcmp(w->name, name)) + w = w->next; + return w; +} + +struct ebt_u_table *ebt_find_table(const char *name) +{ + struct ebt_u_table *t = ebt_tables; + + while (t && strcmp(t->name, name)) + t = t->next; + return t; +} + +/* Prints all registered extensions */ +void ebt_list_extensions() +{ + struct ebt_u_table *tbl = ebt_tables; + struct ebt_u_target *t = ebt_targets; + struct ebt_u_match *m = ebt_matches; + struct ebt_u_watcher *w = ebt_watchers; + + PRINT_VERSION; + printf("Loaded userspace extensions:\n\nLoaded tables:\n"); + while (tbl) { + printf("%s\n", tbl->name); + tbl = tbl->next; + } + printf("\nLoaded targets:\n"); + while (t) { + printf("%s\n", t->name); + t = t->next; + } + printf("\nLoaded matches:\n"); + while (m) { + printf("%s\n", m->name); + m = m->next; + } + printf("\nLoaded watchers:\n"); + while (w) { + printf("%s\n", w->name); + w = w->next; + } +} + +/* Get the table from the kernel or from a binary file + * init: 1 = ask the kernel for the initial contents of a table, i.e. 
the + * way it looks when the table is insmod''ed + * 0 = get the current data in the table */ +int ebt_get_kernel_table(struct ebt_u_replace *replace, int init) +{ + if (!ebt_find_table(replace->name)) { + ebt_print_error("Bad table name ''%s''", replace->name); + return -1; + } + /* Get the kernel''s information */ + if (ebt_get_table(replace, init)) { + if (ebt_errormsg[0] != ''\0'') + return -1; + ebtables_insmod("ebtables"); + if (ebt_get_table(replace, init)) { + ebt_print_error("The kernel doesn''t support the ebtables ''%s'' table", replace->name); + return -1; + } + } + return 0; +} + +/* Put sane values into a new entry */ +void ebt_initialize_entry(struct ebt_u_entry *e) +{ + e->bitmask = EBT_NOPROTO; + e->invflags = 0; + e->ethproto = 0; + strcpy(e->in, ""); + strcpy(e->out, ""); + strcpy(e->logical_in, ""); + strcpy(e->logical_out, ""); + e->m_list = NULL; + e->w_list = NULL; + e->t = (struct ebt_entry_target *)ebt_find_target(EBT_STANDARD_TARGET); + ebt_find_target(EBT_STANDARD_TARGET)->used = 1; + e->cnt.pcnt = e->cnt.bcnt = e->cnt_surplus.pcnt = e->cnt_surplus.bcnt = 0; + + if (!e->t) + ebt_print_bug("Couldn''t load standard target"); + ((struct ebt_standard_target *)((struct ebt_u_target *)e->t)->t)->verdict = EBT_CONTINUE; +} + +/* Free up the memory of the table held in userspace, *replace can be reused */ +void ebt_cleanup_replace(struct ebt_u_replace *replace) +{ + int i; + struct ebt_u_entries *entries; + struct ebt_cntchanges *cc1, *cc2; + struct ebt_u_entry *u_e1, *u_e2; + + replace->name[0] = ''\0''; + replace->valid_hooks = 0; + replace->nentries = 0; + replace->num_counters = 0; + replace->flags = 0; + replace->command = 0; + replace->selected_chain = -1; + free(replace->filename); + replace->filename = NULL; + free(replace->counters); + replace->counters = NULL; + + for (i = 0; i < replace->num_chains; i++) { + if (!(entries = replace->chains[i])) + continue; + u_e1 = entries->entries->next; + while (u_e1 != entries->entries) { + 
ebt_free_u_entry(u_e1); + u_e2 = u_e1->next; + free(u_e1); + u_e1 = u_e2; + } + free(entries->entries); + free(entries); + replace->chains[i] = NULL; + } + cc1 = replace->cc->next; + while (cc1 != replace->cc) { + cc2 = cc1->next; + free(cc1); + cc1 = cc2; + } + replace->cc->next = replace->cc->prev = replace->cc; +} + +/* Should be called, e.g., between 2 rule adds */ +void ebt_reinit_extensions() +{ + struct ebt_u_match *m; + struct ebt_u_watcher *w; + struct ebt_u_target *t; + int size; + + /* The init functions should determine by themselves whether they are + * called for the first time or not (when necessary). */ + for (m = ebt_matches; m; m = m->next) { + if (m->used) { + size = EBT_ALIGN(m->size) + sizeof(struct ebt_entry_match); + m->m = (struct ebt_entry_match *)malloc(size); + if (!m->m) + ebt_print_memory(); + strcpy(m->m->u.name, m->name); + m->m->match_size = EBT_ALIGN(m->size); + m->used = 0; + } + m->flags = 0; /* An error can occur before used is set, while flags is changed. 
*/ + m->init(m->m); + } + for (w = ebt_watchers; w; w = w->next) { + if (w->used) { + size = EBT_ALIGN(w->size) + sizeof(struct ebt_entry_watcher); + w->w = (struct ebt_entry_watcher *)malloc(size); + if (!w->w) + ebt_print_memory(); + strcpy(w->w->u.name, w->name); + w->w->watcher_size = EBT_ALIGN(w->size); + w->used = 0; + } + w->flags = 0; + w->init(w->w); + } + for (t = ebt_targets; t; t = t->next) { + if (t->used) { + size = EBT_ALIGN(t->size) + sizeof(struct ebt_entry_target); + t->t = (struct ebt_entry_target *)malloc(size); + if (!t->t) + ebt_print_memory(); + strcpy(t->t->u.name, t->name); + t->t->target_size = EBT_ALIGN(t->size); + t->used = 0; + } + t->flags = 0; + t->init(t->t); + } +} + +/* This doesn''t free e, because the calling function might need e->next */ +void ebt_free_u_entry(struct ebt_u_entry *e) +{ + struct ebt_u_match_list *m_l, *m_l2; + struct ebt_u_watcher_list *w_l, *w_l2; + + m_l = e->m_list; + while (m_l) { + m_l2 = m_l->next; + free(m_l->m); + free(m_l); + m_l = m_l2; + } + w_l = e->w_list; + while (w_l) { + w_l2 = w_l->next; + free(w_l->w); + free(w_l); + w_l = w_l2; + } + free(e->t); +} + +static char *get_modprobe(void) +{ + int procfile; + char *ret; + + procfile = open(PROC_SYS_MODPROBE, O_RDONLY); + if (procfile < 0) + return NULL; + + ret = malloc(1024); + if (ret) { + if (read(procfile, ret, 1024) == -1) + goto fail; + /* The kernel adds a ''\n'' */ + ret[1023] = ''\n''; + *strchr(ret, ''\n'') = ''\0''; + close(procfile); + return ret; + } + fail: + free(ret); + close(procfile); + return NULL; +} + +char *ebt_modprobe; +/* Try to load the kernel module, analogous to ip_tables.c */ +int ebtables_insmod(const char *modname) +{ + char *buf = NULL; + char *argv[3]; + + /* If they don''t explicitly set it, read out of /proc */ + if (!ebt_modprobe) { + buf = get_modprobe(); + if (!buf) + return -1; + ebt_modprobe = buf; /* Keep the value for possible later use */ + } + + switch (fork()) { + case 0: + argv[0] = (char *)ebt_modprobe; 
+ argv[1] = (char *)modname; + argv[2] = NULL; + execv(argv[0], argv); + + /* Not usually reached */ + exit(0); + case -1: + return -1; + + default: /* Parent */ + wait(NULL); + } + + return 0; +} + +/* Parse the chain name and return a pointer to the chain base. + * Returns NULL on failure. */ +struct ebt_u_entries *ebt_name_to_chain(const struct ebt_u_replace *replace, const char* arg) +{ + int i; + struct ebt_u_entries *chain; + + for (i = 0; i < replace->num_chains; i++) { + if (!(chain = replace->chains[i])) + continue; + if (!strcmp(arg, chain->name)) + return chain; + } + return NULL; +} + +/* Parse the chain name and return the corresponding chain nr + * returns -1 on failure */ +int ebt_get_chainnr(const struct ebt_u_replace *replace, const char* arg) +{ + int i; + + for (i = 0; i < replace->num_chains; i++) { + if (!replace->chains[i]) + continue; + if (!strcmp(arg, replace->chains[i]->name)) + return i; + } + return -1; +} + + /* +************ +************ +**COMMANDS** +************ +************ + */ + +/* Change the policy of selected_chain. + * Handing a bad policy to this function is a bug. 
*/ +void ebt_change_policy(struct ebt_u_replace *replace, int policy) +{ + struct ebt_u_entries *entries = ebt_to_chain(replace); + + if (policy < -NUM_STANDARD_TARGETS || policy == EBT_CONTINUE) + ebt_print_bug("Wrong policy: %d", policy); + entries->policy = policy; +} + +void ebt_delete_cc(struct ebt_cntchanges *cc) +{ + if (cc->type == CNT_ADD) { + cc->prev->next = cc->next; + cc->next->prev = cc->prev; + free(cc); + } + cc->type = CNT_DEL; +} + +void ebt_empty_chain(struct ebt_u_entries *entries) +{ + struct ebt_u_entry *u_e = entries->entries->next, *tmp; + while (u_e != entries->entries) { + ebt_delete_cc(u_e->cc); + ebt_free_u_entry(u_e); + tmp = u_e->next; + free(u_e); + u_e = tmp; + } + entries->entries->next = entries->entries->prev = entries->entries; + entries->nentries = 0; +} + +/* Flush one chain or the complete table + * If selected_chain == -1 then flush the complete table */ +void ebt_flush_chains(struct ebt_u_replace *replace) +{ + int i, numdel; + struct ebt_u_entries *entries = ebt_to_chain(replace); + + /* Flush whole table */ + if (!entries) { + if (replace->nentries == 0) + return; + replace->nentries = 0; + + /* Free everything and zero (n)entries */ + for (i = 0; i < replace->num_chains; i++) { + if (!(entries = replace->chains[i])) + continue; + entries->counter_offset = 0; + ebt_empty_chain(entries); + } + return; + } + + if (entries->nentries == 0) + return; + replace->nentries -= entries->nentries; + numdel = entries->nentries; + + /* Update counter_offset */ + for (i = replace->selected_chain+1; i < replace->num_chains; i++) { + if (!(entries = replace->chains[i])) + continue; + entries->counter_offset -= numdel; + } + + entries = ebt_to_chain(replace); + ebt_empty_chain(entries); +} + +#define OPT_COUNT 0x1000 /* This value is also defined in ebtables.c */ +/* Returns the rule number on success (starting from 0), -1 on failure + * + * This function expects the ebt_{match,watcher,target} members of new_entry + * to contain pointers 
to ebt_u_{match,watcher,target} */ +int ebt_check_rule_exists(struct ebt_u_replace *replace, + struct ebt_u_entry *new_entry) +{ + struct ebt_u_entry *u_e; + struct ebt_u_match_list *m_l, *m_l2; + struct ebt_u_match *m; + struct ebt_u_watcher_list *w_l, *w_l2; + struct ebt_u_watcher *w; + struct ebt_u_target *t = (struct ebt_u_target *)new_entry->t; + struct ebt_u_entries *entries = ebt_to_chain(replace); + int i, j, k; + + u_e = entries->entries->next; + /* Check for an existing rule (if there are duplicate rules, + * take the first occurance) */ + for (i = 0; i < entries->nentries; i++, u_e = u_e->next) { + if (u_e->ethproto != new_entry->ethproto) + continue; + if (strcmp(u_e->in, new_entry->in)) + continue; + if (strcmp(u_e->out, new_entry->out)) + continue; + if (strcmp(u_e->logical_in, new_entry->logical_in)) + continue; + if (strcmp(u_e->logical_out, new_entry->logical_out)) + continue; + if (new_entry->bitmask & EBT_SOURCEMAC && + memcmp(u_e->sourcemac, new_entry->sourcemac, ETH_ALEN)) + continue; + if (new_entry->bitmask & EBT_DESTMAC && + memcmp(u_e->destmac, new_entry->destmac, ETH_ALEN)) + continue; + if (new_entry->bitmask != u_e->bitmask || + new_entry->invflags != u_e->invflags) + continue; + if (replace->flags & OPT_COUNT && (new_entry->cnt.pcnt !+ u_e->cnt.pcnt || new_entry->cnt.bcnt != u_e->cnt.bcnt)) + continue; + /* Compare all matches */ + m_l = new_entry->m_list; + j = 0; + while (m_l) { + m = (struct ebt_u_match *)(m_l->m); + m_l2 = u_e->m_list; + while (m_l2 && strcmp(m_l2->m->u.name, m->m->u.name)) + m_l2 = m_l2->next; + if (!m_l2 || !m->compare(m->m, m_l2->m)) + goto letscontinue; + j++; + m_l = m_l->next; + } + /* Now be sure they have the same nr of matches */ + k = 0; + m_l = u_e->m_list; + while (m_l) { + k++; + m_l = m_l->next; + } + if (j != k) + continue; + + /* Compare all watchers */ + w_l = new_entry->w_list; + j = 0; + while (w_l) { + w = (struct ebt_u_watcher *)(w_l->w); + w_l2 = u_e->w_list; + while (w_l2 && 
strcmp(w_l2->w->u.name, w->w->u.name)) + w_l2 = w_l2->next; + if (!w_l2 || !w->compare(w->w, w_l2->w)) + goto letscontinue; + j++; + w_l = w_l->next; + } + k = 0; + w_l = u_e->w_list; + while (w_l) { + k++; + w_l = w_l->next; + } + if (j != k) + continue; + if (strcmp(t->t->u.name, u_e->t->u.name)) + continue; + if (!t->compare(t->t, u_e->t)) + continue; + return i; +letscontinue:; + } + return -1; +} + +/* Add a rule, rule_nr is the rule to update + * rule_nr specifies where the rule should be inserted + * rule_nr > 0 : insert the rule right before the rule_nr''th rule + * (the first rule is rule 1) + * rule_nr < 0 : insert the rule right before the (n+rule_nr+1)''th rule, + * where n denotes the number of rules in the chain + * rule_nr == 0: add a new rule at the end of the chain + * + * This function expects the ebt_{match,watcher,target} members of new_entry + * to contain pointers to ebt_u_{match,watcher,target} and updates these + * pointers so that they point to ebt_{match,watcher,target}, before adding + * the rule to the chain. 
Don''t free() the ebt_{match,watcher,target} and + * don''t reuse the new_entry after a successful call to ebt_add_rule() */ +void ebt_add_rule(struct ebt_u_replace *replace, struct ebt_u_entry *new_entry, int rule_nr) +{ + int i; + struct ebt_u_entry *u_e; + struct ebt_u_match_list *m_l; + struct ebt_u_watcher_list *w_l; + struct ebt_u_entries *entries = ebt_to_chain(replace); + struct ebt_cntchanges *cc, *new_cc; + + if (rule_nr <= 0) + rule_nr += entries->nentries; + else + rule_nr--; + if (rule_nr > entries->nentries || rule_nr < 0) { + ebt_print_error("The specified rule number is incorrect"); + return; + } + /* Go to the right position in the chain */ + if (rule_nr == entries->nentries) + u_e = entries->entries; + else { + u_e = entries->entries->next; + for (i = 0; i < rule_nr; i++) + u_e = u_e->next; + } + /* We''re adding one rule */ + replace->nentries++; + entries->nentries++; + /* Insert the rule */ + new_entry->next = u_e; + new_entry->prev = u_e->prev; + u_e->prev->next = new_entry; + u_e->prev = new_entry; + new_cc = (struct ebt_cntchanges *)malloc(sizeof(struct ebt_cntchanges)); + if (!new_cc) + ebt_print_memory(); + new_cc->type = CNT_ADD; + new_cc->change = 0; + if (new_entry->next == entries->entries) { + for (i = replace->selected_chain+1; i < replace->num_chains; i++) + if (!replace->chains[i] || replace->chains[i]->nentries == 0) + continue; + else + break; + if (i == replace->num_chains) + cc = replace->cc; + else + cc = replace->chains[i]->entries->next->cc; + } else + cc = new_entry->next->cc; + new_cc->next = cc; + new_cc->prev = cc->prev; + cc->prev->next = new_cc; + cc->prev = new_cc; + new_entry->cc = new_cc; + + /* Put the ebt_{match, watcher, target} pointers in place */ + m_l = new_entry->m_list; + while (m_l) { + m_l->m = ((struct ebt_u_match *)m_l->m)->m; + m_l = m_l->next; + } + w_l = new_entry->w_list; + while (w_l) { + w_l->w = ((struct ebt_u_watcher *)w_l->w)->w; + w_l = w_l->next; + } + new_entry->t = ((struct ebt_u_target 
*)new_entry->t)->t; + /* Update the counter_offset of chains behind this one */ + for (i = replace->selected_chain+1; i < replace->num_chains; i++) { + entries = replace->chains[i]; + if (!(entries = replace->chains[i])) + continue; + entries->counter_offset++; + } +} + +/* If *begin==*end==0 then find the rule corresponding to new_entry, + * else make the rule numbers positive (starting from 0) and check + * for bad rule numbers. */ +static int check_and_change_rule_number(struct ebt_u_replace *replace, + struct ebt_u_entry *new_entry, int *begin, int *end) +{ + struct ebt_u_entries *entries = ebt_to_chain(replace); + + if (*begin < 0) + *begin += entries->nentries + 1; + if (*end < 0) + *end += entries->nentries + 1; + + if (*begin < 0 || *begin > *end || *end > entries->nentries) { + ebt_print_error("Sorry, wrong rule numbers"); + return -1; + } + + if ((*begin * *end == 0) && (*begin + *end != 0)) + ebt_print_bug("begin and end should be either both zero, " + "either both non-zero"); + if (*begin != 0) { + (*begin)--; + (*end)--; + } else { + *begin = ebt_check_rule_exists(replace, new_entry); + *end = *begin; + if (*begin == -1) { + ebt_print_error("Sorry, rule does not exist"); + return -1; + } + } + return 0; +} + +/* Delete a rule or rules + * begin == end == 0: delete the rule corresponding to new_entry + * + * The first rule has rule nr 1, the last rule has rule nr -1, etc. + * This function expects the ebt_{match,watcher,target} members of new_entry + * to contain pointers to ebt_u_{match,watcher,target}. 
*/ +void ebt_delete_rule(struct ebt_u_replace *replace, + struct ebt_u_entry *new_entry, int begin, int end) +{ + int i, nr_deletes; + struct ebt_u_entry *u_e, *u_e2, *u_e3; + struct ebt_u_entries *entries = ebt_to_chain(replace); + + if (check_and_change_rule_number(replace, new_entry, &begin, &end)) + return; + /* We''re deleting rules */ + nr_deletes = end - begin + 1; + replace->nentries -= nr_deletes; + entries->nentries -= nr_deletes; + /* Go to the right position in the chain */ + u_e = entries->entries->next; + for (i = 0; i < begin; i++) + u_e = u_e->next; + u_e3 = u_e->prev; + /* Remove the rules */ + for (i = 0; i < nr_deletes; i++) { + u_e2 = u_e; + ebt_delete_cc(u_e2->cc); + u_e = u_e->next; + /* Free everything */ + ebt_free_u_entry(u_e2); + free(u_e2); + } + u_e3->next = u_e; + u_e->prev = u_e3; + /* Update the counter_offset of chains behind this one */ + for (i = replace->selected_chain+1; i < replace->num_chains; i++) { + if (!(entries = replace->chains[i])) + continue; + entries->counter_offset -= nr_deletes; + } +} + +/* Change the counters of a rule or rules + * begin == end == 0: change counters of the rule corresponding to new_entry + * + * The first rule has rule nr 1, the last rule has rule nr -1, etc. + * This function expects the ebt_{match,watcher,target} members of new_entry + * to contain pointers to ebt_u_{match,watcher,target}. 
+ * The mask denotes the following:
+ * pcnt: mask % 3 = 0 : change; = 1: increment; = 2: decrement
+ * bcnt: mask / 3 = 0 : change; = 1: increment; = 2: decrement
+ * In daemon mode, mask==0 must hold */
+void ebt_change_counters(struct ebt_u_replace *replace,
+		struct ebt_u_entry *new_entry, int begin, int end,
+		struct ebt_counter *cnt, int mask)
+{
+	int i;
+	struct ebt_u_entry *u_e;
+	struct ebt_u_entries *entries = ebt_to_chain(replace);
+
+	if (check_and_change_rule_number(replace, new_entry, &begin, &end))
+		return;
+	/* walk to rule number 'begin' of the selected chain */
+	u_e = entries->entries->next;
+	for (i = 0; i < begin; i++)
+		u_e = u_e->next;
+	for (i = end-begin+1; i > 0; i--) {
+		if (mask % 3 == 0) {
+			u_e->cnt.pcnt = (*cnt).pcnt;
+			u_e->cnt_surplus.pcnt = 0;
+		} else {
+#ifdef EBT_DEBUG
+			if (u_e->cc->type != CNT_NORM)
+				ebt_print_bug("cc->type != CNT_NORM");
+#endif
+			u_e->cnt_surplus.pcnt = (*cnt).pcnt;
+		}
+
+		if (mask / 3 == 0) {
+			u_e->cnt.bcnt = (*cnt).bcnt;
+			u_e->cnt_surplus.bcnt = 0;
+		} else {
+#ifdef EBT_DEBUG
+			if (u_e->cc->type != CNT_NORM)
+				ebt_print_bug("cc->type != CNT_NORM");
+#endif
+			u_e->cnt_surplus.bcnt = (*cnt).bcnt;
+		}
+		if (u_e->cc->type != CNT_ADD)
+			u_e->cc->type = CNT_CHANGE;
+		u_e->cc->change = mask;
+		u_e = u_e->next;
+	}
+}
+
+/* If selected_chain == -1 then zero all counters,
+ * otherwise, zero the counters of selected_chain */
+void ebt_zero_counters(struct ebt_u_replace *replace)
+{
+	struct ebt_u_entries *entries = ebt_to_chain(replace);
+	struct ebt_u_entry *next;
+	int i;
+
+	if (!entries) {
+		for (i = 0; i < replace->num_chains; i++) {
+			if (!(entries = replace->chains[i]))
+				continue;
+			next = entries->entries->next;
+			while (next != entries->entries) {
+				if (next->cc->type == CNT_NORM)
+					next->cc->type = CNT_CHANGE;
+				next->cnt.bcnt = next->cnt.pcnt = 0;
+				next->cc->change = 0;
+				next = next->next;
+			}
+		}
+	} else {
+		if (entries->nentries == 0)
+			return;
+
+		/* NOTE(review): unlike the all-chains branch above, this branch
+		 * does not reset cc->change -- confirm that is intended */
+		next = entries->entries->next;
+		while (next != entries->entries) {
+			if (next->cc->type == CNT_NORM)
+				next->cc->type = CNT_CHANGE;
+			next->cnt.bcnt = next->cnt.pcnt = 0;
+			next = next->next;
+		}
+	}
+}
+
+/* Add a new chain and specify its policy */
+void ebt_new_chain(struct ebt_u_replace *replace, const char *name, int policy)
+{
+	struct ebt_u_entries *new;
+
+	if (replace->num_chains == replace->max_chains)
+		ebt_double_chains(replace);
+	new = (struct ebt_u_entries *)malloc(sizeof(struct ebt_u_entries));
+	if (!new)
+		ebt_print_memory();
+	replace->chains[replace->num_chains++] = new;
+	new->nentries = 0;
+	new->policy = policy;
+	new->counter_offset = replace->nentries;
+	new->hook_mask = 0;
+	/* NOTE(review): unbounded copy; presumably callers have already
+	 * validated the length of name -- confirm */
+	strcpy(new->name, name);
+	/* the chain starts out as an empty circular list: one sentinel
+	 * entry whose next/prev point at itself */
+	new->entries = (struct ebt_u_entry *)malloc(sizeof(struct ebt_u_entry));
+	if (!new->entries)
+		ebt_print_memory();
+	new->entries->next = new->entries->prev = new->entries;
+	new->kernel_start = NULL;
+}
+
+/* returns -1 if the chain is referenced, 0 on success
+ * replace->selected_chain is left unchanged in both cases */
+static int ebt_delete_a_chain(struct ebt_u_replace *replace, int chain, int print_err)
+{
+	int tmp = replace->selected_chain;
+	/* If the chain is referenced, don't delete it,
+	 * also decrement jumps to a chain behind the
+	 * one we're deleting */
+	replace->selected_chain = chain;
+	if (ebt_check_for_references(replace, print_err)) {
+		/* put the caller's selection back before bailing out */
+		replace->selected_chain = tmp;
+		return -1;
+	}
+	decrease_chain_jumps(replace);
+	ebt_flush_chains(replace);
+	replace->selected_chain = tmp;
+	free(replace->chains[chain]->entries);
+	free(replace->chains[chain]);
+	/* close the gap in the chain array */
+	memmove(replace->chains+chain, replace->chains+chain+1, (replace->num_chains-chain-1)*sizeof(void *));
+	replace->num_chains--;
+	return 0;
+}
+
+/* Selected_chain == -1: delete all non-referenced udc
+ * selected_chain < NF_BR_NUMHOOKS is illegal */
+void ebt_delete_chain(struct ebt_u_replace *replace)
+{
+	if (replace->selected_chain != -1 && replace->selected_chain < NF_BR_NUMHOOKS)
+		ebt_print_bug("You can't remove a standard chain");
+	if (replace->selected_chain == -1) {
+		int i = NF_BR_NUMHOOKS;
+
+		/* a successful delete shifts the next chain into slot i,
+		 * so only advance when the chain could not be deleted */
+		while (i < replace->num_chains)
+			if (ebt_delete_a_chain(replace, i, 0))
+				i++;
+	} else
+		ebt_delete_a_chain(replace, replace->selected_chain, 1);
+}
+
+/* Rename an existing chain. */
+void ebt_rename_chain(struct ebt_u_replace *replace, const char *name)
+{
+	struct ebt_u_entries *entries = ebt_to_chain(replace);
+
+	if (!entries)
+		ebt_print_bug("ebt_rename_chain: entries == NULL");
+	strcpy(entries->name, name);
+}
+
+
+	/*
+*************************
+*************************
+**SPECIALIZED*FUNCTIONS**
+*************************
+*************************
+	 */
+
+
+/* Double the capacity of replace->chains, keeping the existing pointers */
+void ebt_double_chains(struct ebt_u_replace *replace)
+{
+	struct ebt_u_entries **new;
+
+	replace->max_chains *= 2;
+	new = (struct ebt_u_entries **)malloc(replace->max_chains*sizeof(void *));
+	if (!new)
+		ebt_print_memory();
+	memcpy(new, replace->chains, replace->max_chains/2*sizeof(void *));
+	free(replace->chains);
+	replace->chains = new;
+}
+
+/* Executes the final_check() function for all extensions used by the rule
+ * ebt_check_for_loops should have been executed earlier, to make sure the
+ * hook_mask is correct. The time argument to final_check() is set to 1,
+ * meaning it's the second time the final_check() function is executed.
*/
+void ebt_do_final_checks(struct ebt_u_replace *replace, struct ebt_u_entry *e,
+		struct ebt_u_entries *entries)
+{
+	struct ebt_u_match_list *m_l;
+	struct ebt_u_watcher_list *w_l;
+	struct ebt_u_target *t;
+	struct ebt_u_match *m;
+	struct ebt_u_watcher *w;
+
+	m_l = e->m_list;
+	w_l = e->w_list;
+	/* matches first; stop as soon as one of them reported an error */
+	while (m_l) {
+		m = ebt_find_match(m_l->m->u.name);
+		m->final_check(e, m_l->m, replace->name,
+			entries->hook_mask, 1);
+		if (ebt_errormsg[0] != '\0')
+			return;
+		m_l = m_l->next;
+	}
+	/* then the watchers, with the same early exit */
+	while (w_l) {
+		w = ebt_find_watcher(w_l->w->u.name);
+		w->final_check(e, w_l->w, replace->name,
+			entries->hook_mask, 1);
+		if (ebt_errormsg[0] != '\0')
+			return;
+		w_l = w_l->next;
+	}
+	/* and finally the target of the rule */
+	t = ebt_find_target(e->t->u.name);
+	t->final_check(e, e->t, replace->name,
+		entries->hook_mask, 1);
+}
+
+/* Returns 1 (if it returns) when the chain is referenced, 0 when it isn't.
+ * print_err: 0 (resp. 1) = don't (resp. do) print error when referenced */
+int ebt_check_for_references(struct ebt_u_replace *replace, int print_err)
+{
+	if (print_err)
+		return iterate_entries(replace, 1);
+	else
+		return iterate_entries(replace, 2);
+}
+
+/* chain_nr: nr of the udc (>= NF_BR_NUMHOOKS)
+ * Returns 1 (if it returns) when the chain is referenced, 0 when it isn't.
+ * print_err: 0 (resp. 1) = don't (resp. do) print error when referenced */
+int ebt_check_for_references2(struct ebt_u_replace *replace, int chain_nr,
+		int print_err)
+{
+	int tmp = replace->selected_chain, ret;
+
+	/* iterate_entries() works on the selected chain, so select
+	 * chain_nr for the duration of the check and restore afterwards */
+	replace->selected_chain = chain_nr;
+	if (print_err)
+		ret = iterate_entries(replace, 1);
+	else
+		ret = iterate_entries(replace, 2);
+	replace->selected_chain = tmp;
+	return ret;
+}
+
+/* One saved position of the chain walk done by ebt_check_for_loops() */
+struct ebt_u_stack
+{
+	int chain_nr;
+	int n;
+	struct ebt_u_entry *e;
+	struct ebt_u_entries *entries;
+};
+
+/* Checks for loops
+ * As a by-product, the hook_mask member of each chain is filled in
+ * correctly. The check functions of the extensions need this hook_mask
+ * to know from which standard chains they can be called.
*/
+void ebt_check_for_loops(struct ebt_u_replace *replace)
+{
+	int chain_nr , i, j , k, sp = 0, verdict;
+	struct ebt_u_entries *entries, *entries2;
+	struct ebt_u_stack *stack = NULL;
+	struct ebt_u_entry *e;
+
+	/* Initialize hook_mask to 0 */
+	for (i = 0; i < replace->num_chains; i++) {
+		if (!(entries = replace->chains[i]))
+			continue;
+		if (i < NF_BR_NUMHOOKS)
+			/* (1 << NF_BR_NUMHOOKS) implies it's a standard chain
+			 * (useful in the final_check() functions) */
+			entries->hook_mask = (1 << i) | (1 << NF_BR_NUMHOOKS);
+		else
+			entries->hook_mask = 0;
+	}
+	/* no user defined chains, so nothing can be jumped to */
+	if (replace->num_chains == NF_BR_NUMHOOKS)
+		return;
+	stack = (struct ebt_u_stack *)malloc((replace->num_chains - NF_BR_NUMHOOKS) * sizeof(struct ebt_u_stack));
+	if (!stack)
+		ebt_print_memory();
+
+	/* Check for loops, starting from every base chain */
+	for (i = 0; i < NF_BR_NUMHOOKS; i++) {
+		if (!(entries = replace->chains[i]))
+			continue;
+		chain_nr = i;
+
+		e = entries->entries->next;
+		for (j = 0; j < entries->nentries; j++) {
+			if (strcmp(e->t->u.name, EBT_STANDARD_TARGET))
+				goto letscontinue;
+			verdict = ((struct ebt_standard_target *)(e->t))->verdict;
+			if (verdict < 0)
+				goto letscontinue;
+			/* Now see if we've been here before */
+			for (k = 0; k < sp; k++)
+				if (stack[k].chain_nr == verdict + NF_BR_NUMHOOKS) {
+					ebt_print_error("Loop from chain '%s' to chain '%s'",
+					   replace->chains[chain_nr]->name,
+					   replace->chains[stack[k].chain_nr]->name);
+					goto free_stack;
+				}
+			entries2 = replace->chains[verdict + NF_BR_NUMHOOKS];
+			/* check if we've dealt with this chain already */
+			if (entries2->hook_mask & (1<<i))
+				goto letscontinue;
+			entries2->hook_mask |= entries->hook_mask;
+			/* Jump to the chain, make sure we know how to get back */
+			stack[sp].chain_nr = chain_nr;
+			stack[sp].n = j;
+			stack[sp].entries = entries;
+			stack[sp].e = e;
+			sp++;
+			/* j++ of the for loop makes this 0 for the new chain */
+			j = -1;
+			e = entries2->entries->next;
+			chain_nr = verdict + NF_BR_NUMHOOKS;
+			entries = entries2;
+			continue;
+letscontinue:
+			e = e->next;
+		}
+		/* We are at the end of a standard chain */
+		if (sp == 0)
+			continue;
+		/* Go back to the chain one level higher */
+		sp--;
+		j = stack[sp].n;
+		chain_nr = stack[sp].chain_nr;
+		e = stack[sp].e;
+		entries = stack[sp].entries;
+		goto letscontinue;
+	}
+free_stack:
+	free(stack);
+	return;
+}
+
+/* The user will use the match, so put it in new_entry. The ebt_u_match
+ * pointer is put in the ebt_entry_match pointer. ebt_add_rule will
+ * fill in the final value for new->m. Unless the rule is added to a chain,
+ * the pointer will keep pointing to the ebt_u_match (until the new_entry
+ * is freed). I know, I should use a union for these 2 pointer types... */
+void ebt_add_match(struct ebt_u_entry *new_entry, struct ebt_u_match *m)
+{
+	struct ebt_u_match_list **m_list, *new;
+
+	/* find the tail of the list, the new node is appended there */
+	for (m_list = &new_entry->m_list; *m_list; m_list = &(*m_list)->next);
+	new = (struct ebt_u_match_list *)
+	   malloc(sizeof(struct ebt_u_match_list));
+	if (!new)
+		ebt_print_memory();
+	*m_list = new;
+	new->next = NULL;
+	new->m = (struct ebt_entry_match *)m;
+}
+
+/* Same as ebt_add_match(), but for the watcher list of new_entry */
+void ebt_add_watcher(struct ebt_u_entry *new_entry, struct ebt_u_watcher *w)
+{
+	struct ebt_u_watcher_list **w_list;
+	struct ebt_u_watcher_list *new;
+
+	for (w_list = &new_entry->w_list; *w_list; w_list = &(*w_list)->next);
+	new = (struct ebt_u_watcher_list *)
+	   malloc(sizeof(struct ebt_u_watcher_list));
+	if (!new)
+		ebt_print_memory();
+	*w_list = new;
+	new->next = NULL;
+	new->w = (struct ebt_entry_watcher *)w;
+}
+
+
+	/*
+*******************
+*******************
+**OTHER*FUNCTIONS**
+*******************
+*******************
+	 */
+
+
+/* type = 0 => update chain jumps
+ * type = 1 => check for reference, print error when referenced
+ * type = 2 => check for reference, don't print error when referenced
+ *
+ * Returns 1 when type == 1 or type == 2 and the chain is referenced
+ * returns 0 otherwise */
+static int iterate_entries(struct ebt_u_replace *replace, int type)
+{
+	int i, j, chain_nr = replace->selected_chain - NF_BR_NUMHOOKS;
+	struct ebt_u_entries *entries;
+	struct ebt_u_entry *e;
+
+	if (chain_nr < 0)
+		ebt_print_bug("iterate_entries: udc = %d < 0", chain_nr);
+	for (i = 0; i < replace->num_chains; i++) {
+		if (!(entries = replace->chains[i]))
+			continue;
+		e = entries->entries->next;
+		for (j = 0; j < entries->nentries; j++) {
+			int chain_jmp;
+
+			/* only standard targets can carry a jump verdict */
+			if (strcmp(e->t->u.name, EBT_STANDARD_TARGET)) {
+				e = e->next;
+				continue;
+			}
+			chain_jmp = ((struct ebt_standard_target *)e->t)->
+				    verdict;
+			switch (type) {
+			case 1:
+			case 2:
+			if (chain_jmp == chain_nr) {
+				if (type == 2)
+					return 1;
+				ebt_print_error("Can't delete the chain '%s', it's referenced in chain '%s', rule %d",
+						replace->chains[chain_nr + NF_BR_NUMHOOKS]->name, entries->name, j);
+				return 1;
+			}
+			break;
+			case 0:
+			/* Adjust the chain jumps when necessary */
+			if (chain_jmp > chain_nr)
+				((struct ebt_standard_target *)e->t)->verdict--;
+			break;
+			} /* End switch */
+			e = e->next;
+		}
+	}
+	return 0;
+}
+
+/* Lower every jump that targets a chain behind the selected chain */
+static void decrease_chain_jumps(struct ebt_u_replace *replace)
+{
+	iterate_entries(replace, 0);
+}
+
+/* Used in initialization code of modules */
+void ebt_register_match(struct ebt_u_match *m)
+{
+	int size = EBT_ALIGN(m->size) + sizeof(struct ebt_entry_match);
+	struct ebt_u_match **i;
+
+	m->m = (struct ebt_entry_match *)malloc(size);
+	if (!m->m)
+		ebt_print_memory();
+	strcpy(m->m->u.name, m->name);
+	m->m->match_size = EBT_ALIGN(m->size);
+	m->init(m->m);
+
+	/* append to the end of the global list */
+	for (i = &ebt_matches; *i; i = &((*i)->next));
+	m->next = NULL;
+	*i = m;
+}
+
+/* Used in initialization code of modules */
+void ebt_register_watcher(struct ebt_u_watcher *w)
+{
+	int size = EBT_ALIGN(w->size) + sizeof(struct ebt_entry_watcher);
+	struct ebt_u_watcher **i;
+
+	w->w = (struct ebt_entry_watcher *)malloc(size);
+	if (!w->w)
+		ebt_print_memory();
+	strcpy(w->w->u.name, w->name);
+	w->w->watcher_size = EBT_ALIGN(w->size);
+	w->init(w->w);
+
+	for (i = &ebt_watchers; *i; i = &((*i)->next));
+	w->next = NULL;
+	*i = w;
+}
+
+/* Used in initialization code of modules */
+void ebt_register_target(struct ebt_u_target *t)
+{
+	int size = EBT_ALIGN(t->size) + sizeof(struct ebt_entry_target);
+	struct ebt_u_target **i;
+
+	t->t = (struct ebt_entry_target *)malloc(size);
+	if (!t->t)
+		ebt_print_memory();
+	strcpy(t->t->u.name, t->name);
+	t->t->target_size = EBT_ALIGN(t->size);
+	t->init(t->t);
+
+	for (i = &ebt_targets; *i; i = &((*i)->next));
+	t->next = NULL;
+	*i = t;
+}
+
+/* Tables are pushed on the front of the global list */
+void ebt_register_table(struct ebt_u_table *t)
+{
+	t->next = ebt_tables;
+	ebt_tables = t;
+}
+
+/* Call f for every registered match */
+void ebt_iterate_matches(void (*f)(struct ebt_u_match *))
+{
+	struct ebt_u_match *i;
+
+	for (i = ebt_matches; i; i = i->next)
+		f(i);
+}
+
+/* Call f for every registered watcher */
+void ebt_iterate_watchers(void (*f)(struct ebt_u_watcher *))
+{
+	struct ebt_u_watcher *i;
+
+	for (i = ebt_watchers; i; i = i->next)
+		f(i);
+}
+
+/* Call f for every registered target */
+void ebt_iterate_targets(void (*f)(struct ebt_u_target *))
+{
+	struct ebt_u_target *i;
+
+	for (i = ebt_targets; i; i = i->next)
+		f(i);
+}
+
+/* Don't use this function, use ebt_print_bug() */
+void __ebt_print_bug(char *file, int line, char *format, ...)
+{
+	va_list l;
+
+	va_start(l, format);
+	fprintf(stderr, PROGNAME" v"PROGVERSION":%s:%d:--BUG--: \n", file, line);
+	vfprintf(stderr, format, l);
+	fprintf(stderr, "\n");
+	va_end(l);
+	exit (-1);
+}
+
+/* The error messages are put in here when ebt_silent == 1
+ * ebt_errormsg[0] == '\0' implies there was no error */
+char ebt_errormsg[ERRORMSG_MAXLEN];
+/* When error messages should not be printed on the screen, after which
+ * the program exit()s, set ebt_silent to 1. */
+int ebt_silent;
+/* Don't use this function, use ebt_print_error() */
+void __ebt_print_error(char *format, ...)
+{
+	va_list l;
+
+	va_start(l, format);
+	/* silent mode keeps the first error only; a second error while
+	 * one is already stored takes the print-and-exit path below */
+	if (ebt_silent && ebt_errormsg[0] == '\0') {
+		vsnprintf(ebt_errormsg, ERRORMSG_MAXLEN, format, l);
+		va_end(l);
+	} else {
+		vfprintf(stderr, format, l);
+		fprintf(stderr, ".\n");
+		va_end(l);
+		exit (-1);
+	}
+}
diff --git a/tools/remus/imqebt/useful_functions.c b/tools/remus/imqebt/useful_functions.c
new file mode 100644
--- /dev/null
+++ b/tools/remus/imqebt/useful_functions.c
@@ -0,0 +1,413 @@
+/*
+ * useful_functions.c, January 2004
+ *
+ * Random collection of functions that can be used by extensions.
+ *
+ * Author: Bart De Schuymer
+ *
+ *  This code is strongly inspired on the iptables code which is
+ *  Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+#include "include/ebtables_u.h"
+#include "include/ethernetdb.h"
+#include <stdio.h>
+#include <netinet/ether.h>
+#include <string.h>
+#include <stdlib.h>
+#include <getopt.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <arpa/inet.h>
+
+/* address/mask pairs behind the symbolic names Unicast, Multicast,
+ * Broadcast and BGA */
+const unsigned char mac_type_unicast[ETH_ALEN] =   {0,0,0,0,0,0};
+const unsigned char msk_type_unicast[ETH_ALEN] =   {1,0,0,0,0,0};
+const unsigned char mac_type_multicast[ETH_ALEN] = {1,0,0,0,0,0};
+const unsigned char msk_type_multicast[ETH_ALEN] = {1,0,0,0,0,0};
+const unsigned char mac_type_broadcast[ETH_ALEN] = {255,255,255,255,255,255};
+const unsigned char msk_type_broadcast[ETH_ALEN] = {255,255,255,255,255,255};
+const unsigned char mac_type_bridge_group[ETH_ALEN] = {0x01,0x80,0xc2,0,0,0};
+const unsigned char msk_type_bridge_group[ETH_ALEN] = {255,255,255,255,255,255};
+
+/* 0: default, print only 2 digits if necessary
+ * 2: always print 2 digits, a printed mac address
+ * then always has the same length */
+int ebt_printstyle_mac;
+
+/* Print the ETH_ALEN bytes at mac to stdout, honouring ebt_printstyle_mac */
+void ebt_print_mac(const unsigned char *mac)
+{
+	if (ebt_printstyle_mac == 2) {
+		int j;
+		for (j = 0; j < ETH_ALEN; j++)
+			printf("%02x%s", mac[j],
+				(j==ETH_ALEN-1) ? "" : ":");
+	} else
+		printf("%s", ether_ntoa((struct ether_addr *) mac));
+}
+
+/* Print mac/mask to stdout: the symbolic name when the pair equals one of
+ * the well-known pairs above, otherwise the address, followed by "/mask"
+ * unless the mask is all ones */
+void ebt_print_mac_and_mask(const unsigned char *mac, const unsigned char *mask)
+{
+	/* unsigned: 0xff does not fit a plain char where char is signed */
+	const unsigned char hlpmsk[ETH_ALEN] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
+
+	if (!memcmp(mac, mac_type_unicast, ETH_ALEN) &&
+	    !memcmp(mask, msk_type_unicast, ETH_ALEN))
+		printf("Unicast");
+	else if (!memcmp(mac, mac_type_multicast, ETH_ALEN) &&
+	         !memcmp(mask, msk_type_multicast, ETH_ALEN))
+		printf("Multicast");
+	else if (!memcmp(mac, mac_type_broadcast, ETH_ALEN) &&
+	         !memcmp(mask, msk_type_broadcast, ETH_ALEN))
+		printf("Broadcast");
+	else if (!memcmp(mac, mac_type_bridge_group, ETH_ALEN) &&
+	         !memcmp(mask, msk_type_bridge_group, ETH_ALEN))
+		printf("BGA");
+	else {
+		ebt_print_mac(mac);
+		if (memcmp(mask, hlpmsk, ETH_ALEN)) {
+			printf("/");
+			ebt_print_mac(mask);
+		}
+	}
+}
+
+/* Checks the type for validity and calls getethertypebynumber(). */
+struct ethertypeent *parseethertypebynumber(int type)
+{
+	if (type < 1536)
+		ebt_print_error("Ethernet protocols have values >= 0x0600");
+	if (type > 0xffff)
+		ebt_print_error("Ethernet protocols have values <= 0xffff");
+	return getethertypebynumber(type);
+}
+
+/* Put the mac address into 6 (ETH_ALEN) bytes returns 0 on success.
*/
+int ebt_get_mac_and_mask(const char *from, unsigned char *to,
+  unsigned char *mask)
+{
+	char *copy, *p;
+	size_t len;
+	int i, ret = -1;
+	struct ether_addr *addr;
+
+	if (strcasecmp(from, "Unicast") == 0) {
+		memcpy(to, mac_type_unicast, ETH_ALEN);
+		memcpy(mask, msk_type_unicast, ETH_ALEN);
+		return 0;
+	}
+	if (strcasecmp(from, "Multicast") == 0) {
+		memcpy(to, mac_type_multicast, ETH_ALEN);
+		memcpy(mask, msk_type_multicast, ETH_ALEN);
+		return 0;
+	}
+	if (strcasecmp(from, "Broadcast") == 0) {
+		memcpy(to, mac_type_broadcast, ETH_ALEN);
+		memcpy(mask, msk_type_broadcast, ETH_ALEN);
+		return 0;
+	}
+	if (strcasecmp(from, "BGA") == 0) {
+		memcpy(to, mac_type_bridge_group, ETH_ALEN);
+		memcpy(mask, msk_type_bridge_group, ETH_ALEN);
+		return 0;
+	}
+	/* from is const: split "address/mask" in a private copy instead
+	 * of writing a '\0' into the caller's string */
+	len = strlen(from) + 1;
+	copy = malloc(len);
+	if (!copy)
+		return -1;
+	memcpy(copy, from, len);
+	if ( (p = strrchr(copy, '/')) != NULL) {
+		*p = '\0';
+		if (!(addr = ether_aton(p + 1)))
+			goto out;
+		memcpy(mask, addr, ETH_ALEN);
+	} else
+		memset(mask, 0xff, ETH_ALEN);
+	if (!(addr = ether_aton(copy)))
+		goto out;
+	memcpy(to, addr, ETH_ALEN);
+	/* keep only the address bits that are covered by the mask */
+	for (i = 0; i < ETH_ALEN; i++)
+		to[i] &= mask[i];
+	ret = 0;
+out:
+	free(copy);
+	return ret;
+}
+
+/* 0: default
+ * 1: the inverse '!' of the option has already been specified */
+int ebt_invert = 0;
+
+/*
+ * Check if the inverse of the option is specified. This is used
+ * in the parse functions of the extensions and ebtables.c
+ */
+int _ebt_check_inverse(const char option[], int argc, char **argv)
+{
+	if (!option)
+		return ebt_invert;
+	if (strcmp(option, "!") == 0) {
+		if (ebt_invert == 1)
+			ebt_print_error("Double use of '!' not allowed");
+		/* step over the '!': the following argument, if there is
+		 * one, becomes the option argument */
+		if (optind >= argc)
+			optarg = NULL;
+		else
+			optarg = argv[optind];
+		optind++;
+		ebt_invert = 1;
+		return 1;
+	}
+	return ebt_invert;
+}
+
+/* Make sure the same option wasn't specified twice.
This is used
+ * in the parse functions of the extensions and ebtables.c */
+void ebt_check_option(unsigned int *flags, unsigned int mask)
+{
+	if (*flags & mask)
+		ebt_print_error("Multiple use of same option not allowed");
+	*flags |= mask;
+}
+
+/* Put the ip string into 4 bytes.
+ * Returns 0 on success, -1 when ip is not a dotted quad of values 0-255 */
+static int undot_ip(char *ip, unsigned char *ip2)
+{
+	char *p, *q, *end;
+	long int onebyte;
+	int i;
+	char buf[20];
+
+	/* strncpy does not terminate buf when ip fills it completely */
+	strncpy(buf, ip, sizeof(buf) - 1);
+	buf[sizeof(buf) - 1] = '\0';
+
+	p = buf;
+	for (i = 0; i < 3; i++) {
+		if ((q = strchr(p, '.')) == NULL)
+			return -1;
+		*q = '\0';
+		onebyte = strtol(p, &end, 10);
+		if (*end != '\0' || onebyte > 255 || onebyte < 0)
+			return -1;
+		ip2[i] = (unsigned char)onebyte;
+		p = q + 1;
+	}
+
+	onebyte = strtol(p, &end, 10);
+	if (*end != '\0' || onebyte > 255 || onebyte < 0)
+		return -1;
+	ip2[3] = (unsigned char)onebyte;
+
+	return 0;
+}
+
+/* Put the mask into 4 bytes.
+ * Accepts a dotted quad or a prefix length 0-32, returns -1 otherwise */
+static int ip_mask(char *mask, unsigned char *mask2)
+{
+	char *end;
+	long int bits;
+	uint32_t mask22;
+
+	if (undot_ip(mask, mask2)) {
+		/* not the /a.b.c.e format, maybe the /x format */
+		bits = strtol(mask, &end, 10);
+		if (*end != '\0' || bits > 32 || bits < 0)
+			return -1;
+		if (bits != 0) {
+			mask22 = htonl(0xFFFFFFFF << (32 - bits));
+			memcpy(mask2, &mask22, 4);
+		} else {
+			/* NOTE(review): a prefix length of 0 yields an all-ones
+			 * mask here, the same as /32 -- confirm this is wanted */
+			mask22 = 0xFFFFFFFF;
+			memcpy(mask2, &mask22, 4);
+		}
+	}
+	return 0;
+}
+
+/* Set the ip mask and ip address. Callers should check ebt_errormsg[0].
+ * The string pointed to by address can be altered.
*/
+void ebt_parse_ip_address(char *address, uint32_t *addr, uint32_t *msk)
+{
+	char *p;
+
+	/* first the mask */
+	if ((p = strrchr(address, '/')) != NULL) {
+		*p = '\0';
+		if (ip_mask(p + 1, (unsigned char *)msk)) {
+			ebt_print_error("Problem with the IP mask '%s'", p + 1);
+			return;
+		}
+	} else
+		*msk = 0xFFFFFFFF;
+
+	if (undot_ip(address, (unsigned char *)addr)) {
+		ebt_print_error("Problem with the IP address '%s'", address);
+		return;
+	}
+	*addr = *addr & *msk;
+}
+
+
+/* Transform the ip mask into a string ready for output.
+ * The result lives in a static buffer that the next call overwrites. */
+char *ebt_mask_to_dotted(uint32_t mask)
+{
+	int i;
+	static char buf[20];
+	uint32_t maskaddr, bits;
+
+	maskaddr = ntohl(mask);
+
+	/* don't print /32 */
+	if (mask == 0xFFFFFFFFL) {
+		*buf = '\0';
+		return buf;
+	}
+
+	i = 32;
+	bits = 0xFFFFFFFEL; /* Case 0xFFFFFFFF has just been dealt with */
+	while (--i >= 0 && maskaddr != bits)
+		bits <<= 1;
+
+	if (i > 0)
+		sprintf(buf, "/%d", i);
+	else if (!i)
+		*buf = '\0';
+	else
+		/* Mask was not a decent combination of 1's and 0's */
+		sprintf(buf, "/%d.%d.%d.%d", ((unsigned char *)&mask)[0],
+		   ((unsigned char *)&mask)[1], ((unsigned char *)&mask)[2],
+		   ((unsigned char *)&mask)[3]);
+
+	return buf;
+}
+
+/* Most of the following code is derived from iptables */
+static void
+in6addrcpy(struct in6_addr *dst, struct in6_addr *src)
+{
+	memcpy(dst, src, sizeof(struct in6_addr));
+}
+
+/* Parse s as an unsigned number in the range [min, max]; max == 0 means
+ * no upper bound. Returns 0 and stores the value in *ret on success,
+ * returns -1 and leaves *ret alone otherwise. */
+int string_to_number_ll(const char *s, unsigned long long min,
+            unsigned long long max, unsigned long long *ret)
+{
+	unsigned long long number;
+	char *end;
+
+	/* Handle hex, octal, etc. */
+	errno = 0;
+	number = strtoull(s, &end, 0);
+	if (*end == '\0' && end != s) {
+		/* we parsed a number, let's see if we want this */
+		if (errno != ERANGE && min <= number && (!max || number <= max)) {
+			*ret = number;
+			return 0;
+		}
+	}
+	return -1;
+}
+
+/* unsigned long flavour of string_to_number_ll() */
+int string_to_number_l(const char *s, unsigned long min, unsigned long max,
+                       unsigned long *ret)
+{
+	int result;
+	unsigned long long number;
+
+	result = string_to_number_ll(s, min, max, &number);
+	/* number is only set on success, don't copy garbage into *ret */
+	if (result == 0)
+		*ret = (unsigned long)number;
+
+	return result;
+}
+
+/* unsigned int flavour of string_to_number_ll() */
+int string_to_number(const char *s, unsigned int min, unsigned int max,
+                     unsigned int *ret)
+{
+	int result;
+	unsigned long number;
+
+	result = string_to_number_l(s, min, max, &number);
+	/* number is only set on success, don't copy garbage into *ret */
+	if (result == 0)
+		*ret = (unsigned int)number;
+
+	return result;
+}
+
+/* Returns a pointer to a static in6_addr holding the parsed address,
+ * or NULL when num is not a valid IPv6 address */
+static struct in6_addr *numeric_to_addr(const char *num)
+{
+	static struct in6_addr ap;
+	int err;
+
+	if ((err=inet_pton(AF_INET6, num, &ap)) == 1)
+		return &ap;
+	return (struct in6_addr *)NULL;
+}
+
+/* Turn mask (an IPv6 address, a prefix length 0-128, or NULL for "no mask
+ * given") into an in6_addr. The result points to static storage. */
+static struct in6_addr *parse_ip6_mask(char *mask)
+{
+	static struct in6_addr maskaddr;
+	struct in6_addr *addrp;
+	unsigned int bits;
+
+	if (mask == NULL) {
+		/* no mask at all defaults to 128 bits */
+		memset(&maskaddr, 0xff, sizeof maskaddr);
+		return &maskaddr;
+	}
+	if ((addrp = numeric_to_addr(mask)) != NULL)
+		return addrp;
+	if (string_to_number(mask, 0, 128, &bits) == -1) {
+		ebt_print_error("Invalid IPv6 Mask '%s' specified", mask);
+		/* ebt_print_error() returns in silent mode; bits was never
+		 * set, so hand back an all-zero mask instead of using it */
+		memset(&maskaddr, 0, sizeof maskaddr);
+		return &maskaddr;
+	}
+
+	/* start from all zeroes and set the leading 'bits' bits; every
+	 * write stays inside the 16 bytes of maskaddr, also when bits is
+	 * a multiple of 8 (including 128) */
+	memset(&maskaddr, 0, sizeof maskaddr);
+	if (bits != 0) {
+		unsigned char *p = (unsigned char *)&maskaddr;
+
+		memset(p, 0xff, bits / 8);
+		if (bits % 8)
+			p[bits / 8] = (unsigned char)(0xff << (8 - (bits % 8)));
+	}
+	return &maskaddr;
+}
+
+/* Set the ipv6 mask and address. Callers should check ebt_errormsg[0].
+ * The string pointed to by address can be altered.
*/
+void ebt_parse_ip6_address(char *address, struct in6_addr *addr,
+                           struct in6_addr *msk)
+{
+	struct in6_addr *tmp_addr;
+	char buf[256];
+	char *p;
+	int i;
+	int err;
+
+	/* work on a bounded, always terminated copy of the input */
+	strncpy(buf, address, sizeof(buf) - 1);
+	/* first the mask */
+	buf[sizeof(buf) - 1] = '\0';
+	if ((p = strrchr(buf, '/')) != NULL) {
+		*p = '\0';
+		tmp_addr = parse_ip6_mask(p + 1);
+	} else
+		tmp_addr = parse_ip6_mask(NULL);
+	in6addrcpy(msk, tmp_addr);
+
+	/* if a null mask is given, the name is ignored, like in "any/0" */
+	if (!memcmp(msk, &in6addr_any, sizeof(in6addr_any)))
+		strcpy(buf, "::");
+
+	if ((err=inet_pton(AF_INET6, buf, addr)) < 1) {
+		ebt_print_error("Invalid IPv6 Address '%s' specified", buf);
+		return;
+	}
+
+	/* keep only the address bits that are covered by the mask */
+	for (i = 0; i < 4; i++)
+		addr->s6_addr32[i] &= msk->s6_addr32[i];
+}
+
+/* Transform the ip6 addr into a string ready for output.
+ * The result lives in a static buffer that the next call overwrites. */
+char *ebt_ip6_to_numeric(const struct in6_addr *addrp)
+{
+	/* 0000:0000:0000:0000:0000:000.000.000.000
+	 * 0000:0000:0000:0000:0000:0000:0000:0000 */
+	static char buf[50+1];
+	return (char *)inet_ntop(AF_INET6, addrp, buf, sizeof(buf));
+}
diff --git a/tools/remus/kmod/Kbuild b/tools/remus/kmod/Kbuild
new file mode 100644
--- /dev/null
+++ b/tools/remus/kmod/Kbuild
@@ -0,0 +1,1 @@
+obj-m := sch_queue.o ebt_imq.o
diff --git a/tools/remus/kmod/Makefile b/tools/remus/kmod/Makefile
new file mode 100644
--- /dev/null
+++ b/tools/remus/kmod/Makefile
@@ -0,0 +1,24 @@
+XEN_ROOT=../../..
+include $(XEN_ROOT)/tools/Rules.mk
+
+# Should make makefiles export linux build directory!
+# This is a fragile hack to tide us over
+ifeq ($(KERNELS),linux-2.6-xen)
+LINUX_VER=2.6.18-xen
+endif
+ifeq ($(KERNELS),linux-2.6-xen0)
+LINUX_VER=2.6.18-xen0
+endif
+
+KERNELDIR ?= $(XEN_ROOT)/build-linux-$(LINUX_VER)_$(XEN_TARGET_ARCH)
+
+.PHONY: all
+all:
+	if test -d $(KERNELDIR); then $(MAKE) -C $(KERNELDIR) SUBDIRS=`pwd` modules; fi
+
+.PHONY: install
+install:
+	if test -d $(KERNELDIR); then $(MAKE) -C $(KERNELDIR) SUBDIRS=`pwd` INSTALL_MOD_PATH=$(DESTDIR) modules_install; fi
+
+clean::
+	-rm -rf *.o *.ko *.mod.c *.mod.o Module.symvers .*.cmd .tmp_versions
diff --git a/tools/remus/kmod/ebt_imq.c b/tools/remus/kmod/ebt_imq.c
new file mode 100644
--- /dev/null
+++ b/tools/remus/kmod/ebt_imq.c
@@ -0,0 +1,45 @@
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter_bridge/ebtables.h>
+#include <linux/netdevice.h>
+#include "ebt_imq.h"
+
+/* Tag the packet with the device number from the rule (info->todev) plus
+ * IMQ_F_ENQUEUE and let ebtables continue with the next rule */
+static int ebt_target_imq(struct sk_buff **pskb, unsigned int hooknr,
+   const struct net_device *in, const struct net_device *out,
+   const void *data, unsigned int datalen)
+{
+	struct ebt_imq_info *info = (struct ebt_imq_info *) data;
+
+	(*pskb)->imq_flags = info->todev | IMQ_F_ENQUEUE;
+
+	return EBT_CONTINUE;
+}
+
+/* NOTE(review): accepts every rule; datalen is not compared with the size
+ * of struct ebt_imq_info -- confirm that is acceptable */
+static int ebt_target_imq_check(const char *tablename, unsigned int hookmask,
+   const struct ebt_entry *e, void *data, unsigned int datalen)
+{
+	return 0;
+}
+
+static struct ebt_target imq_target =
+{
+	.name		= "imq",
+	.target		= ebt_target_imq,
+	.check		= ebt_target_imq_check,
+	.me		= THIS_MODULE,
+};
+
+static int __init init(void)
+{
+	return ebt_register_target(&imq_target);
+}
+
+static void __exit fini(void)
+{
+	ebt_unregister_target(&imq_target);
+}
+
+
+module_init(init);
+module_exit(fini);
+MODULE_LICENSE("GPL");
diff --git a/tools/remus/kmod/ebt_imq.h b/tools/remus/kmod/ebt_imq.h
new file mode 100644
--- /dev/null
+++ b/tools/remus/kmod/ebt_imq.h
@@ -0,0 +1,10 @@
+#ifndef __LINUX_BRIDGE_EBT_IMQ_H
+#define __LINUX_BRIDGE_EBT_IMQ_H
+
+#define IMQ_F_ENQUEUE 0x80
+
+struct ebt_imq_info
+{
+	unsigned int todev;
+};
+#endif
diff --git a/tools/remus/kmod/sch_queue.c b/tools/remus/kmod/sch_queue.c
new file mode 100644
--- /dev/null
+++ b/tools/remus/kmod/sch_queue.c
@@ -0,0 +1,208 @@
+/*
+ * sch_queue.c Queue traffic until an explicit release command
+ *
+ *             This program is free software; you can redistribute it and/or
+ *             modify it under the terms of the GNU General Public License
+ *             as published by the Free Software Foundation; either version
+ *             2 of the License, or (at your option) any later version.
+ *
+ * The operation of the buffer is as follows:
+ * When a checkpoint begins, a barrier is inserted into the
+ *   network queue by a netlink request (it operates by storing
+ *   a pointer to the next packet which arrives and blocking dequeue
+ *   when that packet is at the head of the queue).
+ * When a checkpoint completes (the backup acknowledges receipt),
+ *   currently-queued packets are released.
+ * So it supports two operations, barrier and release.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <net/pkt_sched.h>
+
+/* upper bound for backlog + packet length accepted by queue_enqueue() */
+#define FIFO_BUF    (10*1024*1024)
+
+#define TCQ_CHECKPOINT 0
+#define TCQ_DEQUEUE    1
+
+struct queue_sched_data {
+	/* this packet is the first packet which should not be delivered.
+	 * If it is NULL, queue_enqueue will set it to the next packet it sees. */
+	struct sk_buff *stop;
+};
+
+struct tc_queue_qopt {
+	/* 0: reset stop packet pointer
+	 * 1: dequeue to stop pointer */
+	int action;
+};
+
+/* borrowed from drivers/xen/netback/loopback.c */
+static int is_foreign(unsigned long pfn)
+{
+	/* NB. Play it safe for auto-translation mode. */
+	return (xen_feature(XENFEAT_auto_translated_physmap) ||
+		(phys_to_machine_mapping[pfn] & FOREIGN_FRAME_BIT));
+}
+
+/* Replace every fragment page of skb that is_foreign() with a freshly
+ * allocated page holding a copy of the fragment data.
+ * Returns 1 on success, 0 when a page could not be allocated. */
+static int skb_remove_foreign_references(struct sk_buff *skb)
+{
+	struct page *page;
+	unsigned long pfn;
+	int i, off;
+	char *vaddr;
+
+	BUG_ON(skb_shinfo(skb)->frag_list);
+
+	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+		pfn = page_to_pfn(skb_shinfo(skb)->frags[i].page);
+		if (!is_foreign(pfn))
+			continue;
+		/*
+		printk("foreign ref found\n");
+		*/
+		page = alloc_page(GFP_ATOMIC | __GFP_NOWARN);
+		if (unlikely(!page))
+			return 0;
+
+		/* copy the payload to the same offset in the new page */
+		vaddr = kmap_skb_frag(&skb_shinfo(skb)->frags[i]);
+		off = skb_shinfo(skb)->frags[i].page_offset;
+		memcpy(page_address(page) + off, vaddr + off,
+		       skb_shinfo(skb)->frags[i].size);
+		kunmap_skb_frag(vaddr);
+
+		/* drop the reference on the old page, use the copy */
+		put_page(skb_shinfo(skb)->frags[i].page);
+		skb_shinfo(skb)->frags[i].page = page;
+	}
+
+	return 1;
+}
+
+/* Append skb to the queue as long as backlog + skb->len fits in FIFO_BUF.
+ * The first packet queued while q->stop is NULL becomes the stop packet. */
+static int queue_enqueue(struct sk_buff *skb, struct Qdisc* sch)
+{
+	struct queue_sched_data *q = qdisc_priv(sch);
+
+	if (likely(sch->qstats.backlog + skb->len <= FIFO_BUF))
+	{
+		if (!skb_remove_foreign_references(skb)) {
+			printk("error removing foreign ref\n");
+			return qdisc_reshape_fail(skb, sch);
+		}
+
+		/* record the stop packet only once the skb is certain to be
+		 * queued; presumably qdisc_reshape_fail() disposes of the skb,
+		 * which would leave q->stop pointing at a packet that never
+		 * reaches the queue -- TODO confirm */
+		if (!q->stop)
+			q->stop = skb;
+
+		return qdisc_enqueue_tail(skb, sch);
+	}
+	printk("queue reported full: %d,%d\n", sch->qstats.backlog, skb->len);
+
+	return qdisc_reshape_fail(skb, sch);
+}
+
+/* dequeue doesn't actually dequeue until the release command is
+ * received.
*/
+static inline struct sk_buff *queue_dequeue(struct Qdisc* sch)
+{
+	struct queue_sched_data *priv = qdisc_priv(sch);
+	struct sk_buff *head;
+
+	/* nothing leaves the queue while it is throttled */
+	if (sch->flags & TCQ_F_THROTTLED)
+		return NULL;
+
+	head = (struct sk_buff *)((sch->q).next);
+
+	/* this pointer comparison may be shady */
+	if (head != priv->stop)
+		return qdisc_dequeue_head(sch);
+
+	/* the stop packet is at the head: leave it queued and throttle */
+	sch->flags |= TCQ_F_THROTTLED;
+	return NULL;
+}
+
+/* The qdisc starts out throttled */
+static int queue_init(struct Qdisc *sch, struct rtattr *opt)
+{
+	sch->flags |= TCQ_F_THROTTLED;
+
+	return 0;
+}
+
+/* receives two messages:
+ *   0: checkpoint queue (set stop to next packet)
+ *   1: dequeue until stop
+ * anything else, or a payload smaller than struct tc_queue_qopt,
+ * is answered with -EINVAL */
+static int queue_change(struct Qdisc* sch, struct rtattr* opt)
+{
+	struct queue_sched_data *priv = qdisc_priv(sch);
+	struct tc_queue_qopt *req;
+
+	if (!opt || RTA_PAYLOAD(opt) < sizeof(*req))
+		return -EINVAL;
+
+	req = RTA_DATA(opt);
+
+	switch (req->action) {
+	case TCQ_CHECKPOINT:
+		/* forget the stop packet, queue_enqueue picks a new one */
+		priv->stop = NULL;
+		break;
+	case TCQ_DEQUEUE:
+		/* unthrottle and ask for the device queue to be run */
+		sch->flags &= ~TCQ_F_THROTTLED;
+		netif_schedule(sch->dev);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+struct Qdisc_ops queue_qdisc_ops = {
+	.id		=	"queue",
+	.priv_size	=	sizeof(struct queue_sched_data),
+	.enqueue	=	queue_enqueue,
+	.dequeue	=	queue_dequeue,
+	.init		=	queue_init,
+	.change		=	queue_change,
+	.owner		=	THIS_MODULE,
+};
+
+static int __init queue_module_init(void)
+{
+	printk("loading queue\n");
+	return register_qdisc(&queue_qdisc_ops);
+}
+
+static void __exit queue_module_exit(void)
+{
+	printk("queue unloaded\n");
+	unregister_qdisc(&queue_qdisc_ops);
+}
+module_init(queue_module_init)
+module_exit(queue_module_exit)
+MODULE_LICENSE("GPL");
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel
Konrad Rzeszutek Wilk
2009-Nov-13 14:18 UTC
Re: [Xen-devel] [PATCH 0 of 3] Remus: control tool
On Thu, Nov 12, 2009 at 05:10:21PM -0800, Brendan Cully wrote:
> The following patch series integrates the Remus control layer into
> Xen. It provides a single user-visible script ("remus") to activate
> Remus on a guest virtual machine, and the libraries required by that
> script.

Cool. Looking forward to read them.
>
> Network buffering requires the linux IMQ (http://linuximq.net) patch
> to be applied to dom0. I'll mail the upstream version that applies to
> the linux-2.6.18-xen.hg tree separately.

Is there a 2.6.31.x variant of the code? Are there plans to submit the patches to LKML?

_______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
On Friday, 13 November 2009 at 09:18, Konrad Rzeszutek Wilk wrote:
> On Thu, Nov 12, 2009 at 05:10:21PM -0800, Brendan Cully wrote:
> > The following patch series integrates the Remus control layer into
> > Xen. It provides a single user-visible script ("remus") to activate
> > Remus on a guest virtual machine, and the libraries required by that
> > script.
>
> Cool. Looking forward to read them.
> >
> > Network buffering requires the linux IMQ (http://linuximq.net) patch
> > to be applied to dom0. I'll mail the upstream version that applies to
> > the linux-2.6.18-xen.hg tree separately.
>
> Is there a 2.6.31.x variant of the code? Are there plans to submit the
> patches to LKML?

I haven't tried to get this working on pvops yet, but it's simple in theory:

domU ought to work now, albeit somewhat slowly since the pvops domU doesn't support suspend requests over a dedicated event channel. Cooking up a patch for this is probably not too hard.

dom0 requires nothing but the upstream IMQ patch, and blktap2 support, which I am told it currently lacks, but I'm sure it will get in the not-too-distant future.

_______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
On Fri, Nov 13, 2009 at 01:36:27PM -0800, Brendan Cully wrote:
> On Friday, 13 November 2009 at 09:18, Konrad Rzeszutek Wilk wrote:
> > On Thu, Nov 12, 2009 at 05:10:21PM -0800, Brendan Cully wrote:
> > > The following patch series integrates the Remus control layer into
> > > Xen. It provides a single user-visible script ("remus") to activate
> > > Remus on a guest virtual machine, and the libraries required by that
> > > script.
> >
> > Cool. Looking forward to read them.
> > >
> > > Network buffering requires the linux IMQ (http://linuximq.net) patch
> > > to be applied to dom0. I'll mail the upstream version that applies to
> > > the linux-2.6.18-xen.hg tree separately.
> >
> > Is there a 2.6.31.x variant of the code? Are there plans to submit the
> > patches to LKML?
>
> I haven't tried to get this working on pvops yet, but it's simple in
> theory:
>
> domU ought to work now, albeit somewhat slowly since the pvops domU
> doesn't support suspend requests over a dedicated event
> channel. Cooking up a patch for this is probably not too hard.
>

Do you know if anyone is working on the pvops support for suspend requests over dedicated event channel?

-- Pasi

_______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
On Tuesday, 01 December 2009 at 17:20, Pasi Kärkkäinen wrote:
> On Fri, Nov 13, 2009 at 01:36:27PM -0800, Brendan Cully wrote:
> > On Friday, 13 November 2009 at 09:18, Konrad Rzeszutek Wilk wrote:
> > > On Thu, Nov 12, 2009 at 05:10:21PM -0800, Brendan Cully wrote:
> > > > The following patch series integrates the Remus control layer into
> > > > Xen. It provides a single user-visible script ("remus") to activate
> > > > Remus on a guest virtual machine, and the libraries required by that
> > > > script.
> > >
> > > Cool. Looking forward to read them.
> > > >
> > > > Network buffering requires the linux IMQ (http://linuximq.net) patch
> > > > to be applied to dom0. I'll mail the upstream version that applies to
> > > > the linux-2.6.18-xen.hg tree separately.
> > >
> > > Is there a 2.6.31.x variant of the code? Are there plans to submit the
> > > patches to LKML?
> >
> > I haven't tried to get this working on pvops yet, but it's simple in
> > theory:
> >
> > domU ought to work now, albeit somewhat slowly since the pvops domU
> > doesn't support suspend requests over a dedicated event
> > channel. Cooking up a patch for this is probably not too hard.
> >
>
> Do you know if anyone is working on the pvops support for suspend requests
> over dedicated event channel?

I'll probably take a crack at this soon. From a quick look at the pvops suspend code, it seems like it may suffer from a race when multiple suspends are issued that Keir fixed in the 2.6.18 tree some time ago -- it'd be better to get that fixed before porting the event channel patch.

In the meantime, it looks like I didn't include fallback support for checkpointing with xenstore, so Remus doesn't support pvops domu at all. I should have a patch to support the slow mode out today.

_______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Jeremy Fitzhardinge
2009-Dec-01 23:04 UTC
Re: [Xen-devel] [PATCH 0 of 3] Remus: control tool
On 12/01/09 08:16, Brendan Cully wrote:
> I'll probably take a crack at this soon. From a quick look at the
> pvops suspend code, it seems like it may suffer from a race when
> multiple suspends are issued that Keir fixed in the 2.6.18 tree some
> time ago -- it'd be better to get that fixed before porting the event
> channel patch.
>

What's the bug? I don't see a reentrancy issue there because the suspend happens synchronously in the xenwatch thread under xenwatch_mutex. Or race for that matter.

Am I missing something?

Of course, if you start triggering suspends via event channels, we'll need to work out something else.

J

_______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
On Tuesday, 01 December 2009 at 15:04, Jeremy Fitzhardinge wrote:
> On 12/01/09 08:16, Brendan Cully wrote:
> > I'll probably take a crack at this soon. From a quick look at the
> > pvops suspend code, it seems like it may suffer from a race when
> > multiple suspends are issued that Keir fixed in the 2.6.18 tree some
> > time ago -- it'd be better to get that fixed before porting the event
> > channel patch.
> >
>
> What's the bug? I don't see a reentrancy issue there because the
> suspend happens synchronously in the xenwatch thread under
> xenwatch_mutex. Or race for that matter.
>
> Am I missing something?

I'll need to take a closer look, but the race I'm remembering was fixed in the 2.6.18 tree here:

http://xenbits.xen.org/linux-2.6.18-xen.hg/rev/49ffe9ef67d4

> Of course, if you start triggering suspends via event channels, we'll
> need to work out something else.

yep.

_______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
On 01/12/2009 23:04, "Jeremy Fitzhardinge" <jeremy@goop.org> wrote:
> What's the bug? I don't see a reentrancy issue there because the
> suspend happens synchronously in the xenwatch thread under
> xenwatch_mutex. Or race for that matter.
>
> Am I missing something?
>
>
> Of course, if you start triggering suspends via event channels, we'll
> need to work out something else.

The issue in 2.6.18 was that, if doing back-to-back save/restores, the next event-channel notification could come in before domU was finished with previous s/r cycle, and then the notification got dropped. There are a number of ways of dealing with that of course: I implemented a little state machine; or you could probably do it with some kind of ticket-based scheme; or perhaps have the evtchn irq handler spawn a kthread which blocks on the mutex (I liked that one least as it needs to allocate resources).

-- Keir

_______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Jeremy Fitzhardinge
2009-Dec-02 18:17 UTC
Re: [Xen-devel] [PATCH 0 of 3] Remus: control tool
On 12/02/09 00:07, Keir Fraser wrote:
> The issue in 2.6.18 was that, if doing back-to-back save/restores, the next
> event-channel notification could come in before domU was finished with
> previous s/r cycle, and then the notification got dropped. There are a
> number of ways of dealing with that of course: I implemented a little state
> machine; or you could probably do it with some kind of ticket-based scheme;
> or perhaps have the evtchn irq handler spawn a kthread which blocks on the
> mutex (I liked that one least as it needs to allocate resources).
>

Hm, my first thought is "why does that matter?". But I guess the host/guest save protocol is fairly brittle, and if the guest doesn't respond to a particular save it will get wedged. But then, should the control stack be sending back to back save requests? Shouldn't it wait until the previous save has finished?

J

_______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
On Wednesday, 02 December 2009 at 10:17, Jeremy Fitzhardinge wrote:
> On 12/02/09 00:07, Keir Fraser wrote:
> > The issue in 2.6.18 was that, if doing back-to-back save/restores, the next
> > event-channel notification could come in before domU was finished with
> > previous s/r cycle, and then the notification got dropped. There are a
> > number of ways of dealing with that of course: I implemented a little state
> > machine; or you could probably do it with some kind of ticket-based scheme;
> > or perhaps have the evtchn irq handler spawn a kthread which blocks on the
> > mutex (I liked that one least as it needs to allocate resources).
> >
>
> Hm, my first thought is "why does that matter?". But I guess the
> host/guest save protocol is fairly brittle, and if the guest doesn't
> respond to a particular save it will get wedged. But then, should the
> control stack be sending back to back save requests? Shouldn't it wait
> until the previous save has finished?

What signal do you have in mind for telling the control stack that the guest has completed its resume procedure and is running normally again?

_______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
On 02/12/2009 18:17, "Jeremy Fitzhardinge" <jeremy@goop.org> wrote:
> On 12/02/09 00:07, Keir Fraser wrote:
>> The issue in 2.6.18 was that, if doing back-to-back save/restores, the next
>> event-channel notification could come in before domU was finished with
>> previous s/r cycle, and then the notification got dropped. There are a
>> number of ways of dealing with that of course: I implemented a little state
>> machine; or you could probably do it with some kind of ticket-based scheme;
>> or perhaps have the evtchn irq handler spawn a kthread which blocks on the
>> mutex (I liked that one least as it needs to allocate resources).
>
> Hm, my first thought is "why does that matter?". But I guess the
> host/guest save protocol is fairly brittle, and if the guest doesn't
> respond to a particular save it will get wedged. But then, should the
> control stack be sending back to back save requests? Shouldn't it wait
> until the previous save has finished?

From the tools p.o.v. the restore has finished when it kicks off execution of the guest. It's not that hard to handle this in the guest; you just need to do it. ;-)

-- Keir

_______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Jeremy Fitzhardinge
2009-Dec-02 18:30 UTC
Re: [Xen-devel] [PATCH 0 of 3] Remus: control tool
On 12/02/09 10:20, Brendan Cully wrote:
> What signal do you have in mind for telling the control stack that the
> guest has completed its resume procedure and is running normally
> again?
>

Hm, point. I was thinking of waiting for the "I'm suspended" hypercall, but that doesn't help. It has always worried me that the suspend protocol is very brittle. For example, there's no way for a guest to reject a suspend attempt, either because it doesn't support suspending, or it isn't convenient right now, or it tried but failed. A backchannel xenstore entry would allow the guest to indicate what stage it's up to to the control stack, including holding off suspend attempts until it is ready to accept new ones. But that doesn't help much if you want to eliminate the xenstore overhead from the process...

J

_______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Jeremy Fitzhardinge
2009-Dec-02 18:34 UTC
Re: [Xen-devel] [PATCH 0 of 3] Remus: control tool
On 12/02/09 10:25, Keir Fraser wrote:
> From the tools p.o.v. the restore has finished when it kicks off execution
> of the guest. It's not that hard to handle this in the guest; you just need
> to do it. ;-)

Well, I think that just happens at the moment; the suspend is happening from the xenstore watch thread, so there's no way it will even notice a subsequent attempt until the suspend/resume cycle is done. When we start triggering suspends from event channels, I'd suggest something along the lines of wrapping the whole thing in a big fat suspend mutex, and running the suspend from a workqueue triggered from the interrupt or watch handler. And then deal with all the edge cases.

J

_______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel