[libvirt] [RFC] Substitute poll by epoll in virEventPollRunOnce

Derbyshev Dmitriy dderbyshev at virtuozzo.com
Fri Feb 10 13:29:27 UTC 2017


From: Derbyshev Dmitry <dderbyshev at virtuozzo.com>

This makes it possible to avoid allocations in
virEventPollMakePollFDs, as well as generally reduces time spent in
kernel if handles remain the same, which should generally be true.

__________________________________________

Questions:
1) What to do with probe in virEventPollRunOnce?
2) Are ifdef an acceptable solution to epoll avaliability issues?
3) Is using MAX_POLL_EVENTS_AT_ONCE constant a valid solution?
4) some fds are sometimes added twice, but only once with events!=0? This complicates virEventPoll*Handle fuctions. At the very least, it is called twice in _dbus_watch_list_set_functions.

Signed-off-by: Derbyshev Dmitry <dderbyshev at virtuozzo.com>
---
 configure.ac                     |  26 +++++
 src/util/vireventpoll.c          | 205 ++++++++++++++++++++++++++++++++++++---
 src/util/vireventpoll.h          |   5 +
 tests/commanddata/test3epoll.log |  15 +++
 tests/commandtest.c              |   4 +
 5 files changed, 242 insertions(+), 13 deletions(-)
 create mode 100644 tests/commanddata/test3epoll.log

diff --git a/configure.ac b/configure.ac
index dfc536f..c985ccc 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2695,6 +2695,32 @@ AC_DEFINE_UNQUOTED([isbase64],[libvirt_gl_isbase64],[Hack to avoid symbol clash]
 AC_DEFINE_UNQUOTED([base64_encode],[libvirt_gl_base64_encode],[Hack to avoid symbol clash])
 AC_DEFINE_UNQUOTED([base64_encode_alloc],[libvirt_gl_base64_encode_alloc],[Hack to avoid symbol clash])
 
+AC_ARG_ENABLE([epoll],
+              [AS_HELP_STRING([--enable-epoll],[use epoll(4) on Linux])],
+              [enable_epoll=$enableval], [enable_epoll=auto])
+if test x$enable_epoll = xno; then
+    have_linux_epoll=no
+else
+    AC_MSG_CHECKING([for Linux epoll(4)])
+    AC_LINK_IFELSE([AC_LANG_PROGRAM(
+        [
+        #ifndef __linux__
+        #error This is not Linux
+        #endif
+        #include <sys/epoll.h>
+        ],
+        [epoll_create1 (EPOLL_CLOEXEC);])],
+        [have_linux_epoll=yes],
+        [have_linux_epoll=no])
+    AC_MSG_RESULT([$have_linux_epoll])
+fi
+if test x$enable_epoll,$have_linux_epoll = xyes,no; then
+    AC_MSG_ERROR([epoll support explicitly enabled but not available])
+fi
+if test "x$have_linux_epoll" = xyes; then
+  AC_DEFINE_UNQUOTED([HAVE_LINUX_EPOLL], 1, [whether epoll is supported])
+fi
+
 AC_CONFIG_FILES([run],
                 [chmod +x,-w run])
 AC_CONFIG_FILES([\
diff --git a/src/util/vireventpoll.c b/src/util/vireventpoll.c
index 81ecab4..1541706 100644
--- a/src/util/vireventpoll.c
+++ b/src/util/vireventpoll.c
@@ -25,7 +25,11 @@
 
 #include <stdlib.h>
 #include <string.h>
-#include <poll.h>
+#ifdef HAVE_LINUX_EPOLL
+# include <sys/epoll.h>
+#else
+# include <poll.h>
+#endif
 #include <sys/time.h>
 #include <errno.h>
 #include <unistd.h>
@@ -87,6 +91,9 @@ struct virEventPollLoop {
     size_t timeoutsCount;
     size_t timeoutsAlloc;
     struct virEventPollTimeout *timeouts;
+#ifdef HAVE_LINUX_EPOLL
+    int epollfd;
+#endif
 };
 
 /* Only have one event loop */
@@ -109,6 +116,11 @@ int virEventPollAddHandle(int fd, int events,
                           virFreeCallback ff)
 {
     int watch;
+#ifdef HAVE_LINUX_EPOLL
+    size_t i;
+    struct epoll_event ev;
+#endif
+    int native = virEventPollToNativeEvents(events);
     virMutexLock(&eventLoop.lock);
     if (eventLoop.handlesCount == eventLoop.handlesAlloc) {
         EVENT_DEBUG("Used %zu handle slots, adding at least %d more",
@@ -124,8 +136,7 @@ int virEventPollAddHandle(int fd, int events,
 
     eventLoop.handles[eventLoop.handlesCount].watch = watch;
     eventLoop.handles[eventLoop.handlesCount].fd = fd;
-    eventLoop.handles[eventLoop.handlesCount].events =
-                                         virEventPollToNativeEvents(events);
+    eventLoop.handles[eventLoop.handlesCount].events = native;
     eventLoop.handles[eventLoop.handlesCount].cb = cb;
     eventLoop.handles[eventLoop.handlesCount].ff = ff;
     eventLoop.handles[eventLoop.handlesCount].opaque = opaque;
@@ -133,7 +144,30 @@ int virEventPollAddHandle(int fd, int events,
 
     eventLoop.handlesCount++;
 
+#ifdef HAVE_LINUX_EPOLL
+    ev.events = native;
+    ev.data.fd = fd;
+    if (epoll_ctl(eventLoop.epollfd, EPOLL_CTL_ADD, fd, &ev) < 0) {
+        if (errno == EEXIST) {
+            for (i = 0; i < eventLoop.handlesCount; i++) {
+                if (eventLoop.handles[i].fd == fd &&
+                    !eventLoop.handles[i].deleted) {
+                    ev.events |= eventLoop.handles[i].events;
+                }
+            }
+            if (epoll_ctl(eventLoop.epollfd, EPOLL_CTL_MOD, fd, &ev) < 0) {
+                virMutexUnlock(&eventLoop.lock);
+                return -1;
+            }
+        }
+        else {
+            virMutexUnlock(&eventLoop.lock);
+            return -1;
+        }
+    }
+#else
     virEventPollInterruptLocked();
+#endif
 
     PROBE(EVENT_POLL_ADD_HANDLE,
           "watch=%d fd=%d events=%d cb=%p opaque=%p ff=%p",
@@ -145,8 +179,14 @@ int virEventPollAddHandle(int fd, int events,
 
 void virEventPollUpdateHandle(int watch, int events)
 {
+#ifdef HAVE_LINUX_EPOLL
+    struct epoll_event ev;
+    size_t i, j;
+#else
     size_t i;
+#endif
     bool found = false;
+    int native = virEventPollToNativeEvents(events);
     PROBE(EVENT_POLL_UPDATE_HANDLE,
           "watch=%d events=%d",
           watch, events);
@@ -159,13 +199,34 @@ void virEventPollUpdateHandle(int watch, int events)
     virMutexLock(&eventLoop.lock);
     for (i = 0; i < eventLoop.handlesCount; i++) {
         if (eventLoop.handles[i].watch == watch) {
-            eventLoop.handles[i].events =
-                    virEventPollToNativeEvents(events);
+            eventLoop.handles[i].events = native;
+#ifndef HAVE_LINUX_EPOLL
             virEventPollInterruptLocked();
+#endif
             found = true;
             break;
         }
     }
+
+#ifdef HAVE_LINUX_EPOLL
+    ev.events = native;
+    for (j = 0; j < eventLoop.handlesCount; j++) {
+        if (eventLoop.handles[i].fd == eventLoop.handles[i].fd &&
+            !eventLoop.handles[i].deleted &&
+            i != j) {
+            ev.events |= eventLoop.handles[i].events;
+        }
+    }
+    ev.data.fd = eventLoop.handles[i].fd;
+
+    if (found && epoll_ctl(eventLoop.epollfd, EPOLL_CTL_MOD,
+                           eventLoop.handles[i].fd, &ev) < 0) {
+        VIR_WARN("Update for existing handle watch %d failed", watch);
+        virMutexUnlock(&eventLoop.lock);
+        return;
+    }
+#endif
+
     virMutexUnlock(&eventLoop.lock);
 
     if (!found)
@@ -180,7 +241,12 @@ void virEventPollUpdateHandle(int watch, int events)
  */
 int virEventPollRemoveHandle(int watch)
 {
+#ifdef HAVE_LINUX_EPOLL
+    struct epoll_event ev;
+    size_t i, j;
+#else
     size_t i;
+#endif
     PROBE(EVENT_POLL_REMOVE_HANDLE,
           "watch=%d",
           watch);
@@ -196,15 +262,47 @@ int virEventPollRemoveHandle(int watch)
             continue;
 
         if (eventLoop.handles[i].watch == watch) {
-            EVENT_DEBUG("mark delete %zu %d", i, eventLoop.handles[i].fd);
-            eventLoop.handles[i].deleted = 1;
-            virEventPollInterruptLocked();
+            break;
+        }
+    }
+
+    if (i == eventLoop.handlesCount) {
+        virMutexUnlock(&eventLoop.lock);
+        return -1;
+    }
+
+#ifdef HAVE_LINUX_EPOLL
+    ev.events = 0;
+    for (j = 0; j < eventLoop.handlesCount; j++) {
+        if (eventLoop.handles[j].fd == eventLoop.handles[i].fd &&
+            !eventLoop.handles[j].deleted &&
+            i != j) {
+            ev.events |= eventLoop.handles[i].events;
+        }
+    }
+
+    ev.data.fd = eventLoop.handles[i].fd;
+    if (ev.events) {
+        if (epoll_ctl(eventLoop.epollfd, EPOLL_CTL_MOD, eventLoop.handles[i].fd,
+                      &ev) < 0) {
             virMutexUnlock(&eventLoop.lock);
-            return 0;
+            return -1;
         }
     }
+    else {
+        if (epoll_ctl(eventLoop.epollfd, EPOLL_CTL_DEL, eventLoop.handles[i].fd,
+                      &ev) < 0) {
+            virMutexUnlock(&eventLoop.lock);
+            return -1;
+        }
+    }
+#endif
+
+    EVENT_DEBUG("mark delete %zu %d", i, eventLoop.handles[i].fd);
+    eventLoop.handles[i].deleted = 1;
+    virEventPollInterruptLocked();
     virMutexUnlock(&eventLoop.lock);
-    return -1;
+    return 0;
 }
 
 
@@ -373,6 +471,7 @@ static int virEventPollCalculateTimeout(int *timeout)
     return 0;
 }
 
+#ifndef HAVE_LINUX_EPOLL
 /*
  * Allocate a pollfd array containing data for all registered
  * file handles. The caller must free the returned data struct
@@ -409,6 +508,7 @@ static struct pollfd *virEventPollMakePollFDs(int *nfds) {
 
     return fds;
 }
+#endif
 
 
 /*
@@ -472,7 +572,11 @@ static int virEventPollDispatchTimeouts(void)
  *
  * Returns 0 upon success, -1 if an error occurred
  */
+#ifdef HAVE_LINUX_EPOLL
+static int virEventPollDispatchHandles(int nfds, struct epoll_event *events)
+#else
 static int virEventPollDispatchHandles(int nfds, struct pollfd *fds)
+#endif
 {
     size_t i, n;
     VIR_DEBUG("Dispatch %d", nfds);
@@ -482,7 +586,11 @@ static int virEventPollDispatchHandles(int nfds, struct pollfd *fds)
      * in the fds array we've got */
     for (i = 0, n = 0; n < nfds && i < eventLoop.handlesCount; n++) {
         while (i < eventLoop.handlesCount &&
+#ifdef HAVE_LINUX_EPOLL
+               (eventLoop.handles[i].fd != events[n].data.fd ||
+#else
                (eventLoop.handles[i].fd != fds[n].fd ||
+#endif
                 eventLoop.handles[i].events == 0)) {
             i++;
         }
@@ -496,16 +604,25 @@ static int virEventPollDispatchHandles(int nfds, struct pollfd *fds)
             continue;
         }
 
+#ifdef HAVE_LINUX_EPOLL
+        if (events[n].events) {
+            int hEvents = virEventPollFromNativeEvents(events[n].events);
+#else
         if (fds[n].revents) {
+            int hEvents = virEventPollFromNativeEvents(fds[n].revents);
+#endif
             virEventHandleCallback cb = eventLoop.handles[i].cb;
             int watch = eventLoop.handles[i].watch;
             void *opaque = eventLoop.handles[i].opaque;
-            int hEvents = virEventPollFromNativeEvents(fds[n].revents);
             PROBE(EVENT_POLL_DISPATCH_HANDLE,
                   "watch=%d events=%d",
                   watch, hEvents);
             virMutexUnlock(&eventLoop.lock);
+#ifdef HAVE_LINUX_EPOLL
+            (cb)(watch, events[n].data.fd, hEvents, opaque);
+#else
             (cb)(watch, fds[n].fd, hEvents, opaque);
+#endif
             virMutexLock(&eventLoop.lock);
         }
     }
@@ -618,7 +735,11 @@ static void virEventPollCleanupHandles(void)
  */
 int virEventPollRunOnce(void)
 {
+#ifdef HAVE_LINUX_EPOLL
+    struct epoll_event events[MAX_POLL_EVENTS_AT_ONCE];
+#else
     struct pollfd *fds = NULL;
+#endif
     int ret, timeout, nfds;
 
     virMutexLock(&eventLoop.lock);
@@ -628,17 +749,29 @@ int virEventPollRunOnce(void)
     virEventPollCleanupTimeouts();
     virEventPollCleanupHandles();
 
-    if (!(fds = virEventPollMakePollFDs(&nfds)) ||
-        virEventPollCalculateTimeout(&timeout) < 0)
+#ifndef HAVE_LINUX_EPOLL
+    if (!(fds = virEventPollMakePollFDs(&nfds)))
+        goto error;
+#endif
+    if (virEventPollCalculateTimeout(&timeout) < 0)
         goto error;
 
     virMutexUnlock(&eventLoop.lock);
 
  retry:
+#ifdef HAVE_LINUX_EPOLL
+//FIXME: PROBE handles
+    PROBE(EVENT_POLL_RUN,
+          "nhandles=%d timeout=%d",
+          42, timeout);
+    nfds = ret = epoll_wait(eventLoop.epollfd, events,
+                     MAX_POLL_EVENTS_AT_ONCE, timeout);
+#else
     PROBE(EVENT_POLL_RUN,
           "nhandles=%d timeout=%d",
           nfds, timeout);
     ret = poll(fds, nfds, timeout);
+#endif
     if (ret < 0) {
         EVENT_DEBUG("Poll got error event %d", errno);
         if (errno == EINTR || errno == EAGAIN)
@@ -653,22 +786,32 @@ int virEventPollRunOnce(void)
     if (virEventPollDispatchTimeouts() < 0)
         goto error;
 
+#ifdef HAVE_LINUX_EPOLL
+    if (ret > 0 &&
+        virEventPollDispatchHandles(nfds, events) < 0)
+        goto error;
+#else
     if (ret > 0 &&
         virEventPollDispatchHandles(nfds, fds) < 0)
         goto error;
+#endif
 
     virEventPollCleanupTimeouts();
     virEventPollCleanupHandles();
 
     eventLoop.running = 0;
     virMutexUnlock(&eventLoop.lock);
+#ifndef HAVE_LINUX_EPOLL
     VIR_FREE(fds);
+#endif
     return 0;
 
  error:
     virMutexUnlock(&eventLoop.lock);
  error_unlocked:
+#ifndef HAVE_LINUX_EPOLL
     VIR_FREE(fds);
+#endif
     return -1;
 }
 
@@ -698,6 +841,17 @@ int virEventPollInit(void)
         return -1;
     }
 
+#ifdef HAVE_LINUX_EPOLL
+    eventLoop.epollfd = epoll_create1(0);
+    if (eventLoop.epollfd < 0) {
+        virReportError(VIR_ERR_INTERNAL_ERROR,
+                       _("Unable to initialize epoll"));
+        VIR_FORCE_CLOSE(eventLoop.wakeupfd[0]);
+        VIR_FORCE_CLOSE(eventLoop.wakeupfd[1]);
+        return -1;
+    }
+#endif
+
     if (virEventPollAddHandle(eventLoop.wakeupfd[0],
                               VIR_EVENT_HANDLE_READABLE,
                               virEventPollHandleWakeup, NULL, NULL) < 0) {
@@ -706,6 +860,9 @@ int virEventPollInit(void)
                        eventLoop.wakeupfd[0]);
         VIR_FORCE_CLOSE(eventLoop.wakeupfd[0]);
         VIR_FORCE_CLOSE(eventLoop.wakeupfd[1]);
+#ifdef HAVE_LINUX_EPOLL
+        VIR_FORCE_CLOSE(eventLoop.epollfd);
+#endif
         return -1;
     }
 
@@ -742,6 +899,16 @@ int
 virEventPollToNativeEvents(int events)
 {
     int ret = 0;
+#ifdef HAVE_LINUX_EPOLL
+    if (events & VIR_EVENT_HANDLE_READABLE)
+        ret |= EPOLLIN;
+    if (events & VIR_EVENT_HANDLE_WRITABLE)
+        ret |= EPOLLOUT;
+    if (events & VIR_EVENT_HANDLE_ERROR)
+        ret |= EPOLLERR;
+    if (events & VIR_EVENT_HANDLE_HANGUP)
+        ret |= EPOLLHUP;
+#else
     if (events & VIR_EVENT_HANDLE_READABLE)
         ret |= POLLIN;
     if (events & VIR_EVENT_HANDLE_WRITABLE)
@@ -750,6 +917,7 @@ virEventPollToNativeEvents(int events)
         ret |= POLLERR;
     if (events & VIR_EVENT_HANDLE_HANGUP)
         ret |= POLLHUP;
+#endif
     return ret;
 }
 
@@ -757,6 +925,16 @@ int
 virEventPollFromNativeEvents(int events)
 {
     int ret = 0;
+#ifdef HAVE_LINUX_EPOLL
+    if (events & EPOLLIN)
+        ret |= VIR_EVENT_HANDLE_READABLE;
+    if (events & EPOLLOUT)
+        ret |= VIR_EVENT_HANDLE_WRITABLE;
+    if (events & EPOLLERR)
+        ret |= VIR_EVENT_HANDLE_ERROR;
+    if (events & EPOLLHUP)
+        ret |= VIR_EVENT_HANDLE_HANGUP;
+#else
     if (events & POLLIN)
         ret |= VIR_EVENT_HANDLE_READABLE;
     if (events & POLLOUT)
@@ -767,5 +945,6 @@ virEventPollFromNativeEvents(int events)
         ret |= VIR_EVENT_HANDLE_ERROR;
     if (events & POLLHUP)
         ret |= VIR_EVENT_HANDLE_HANGUP;
+#endif
     return ret;
 }
diff --git a/src/util/vireventpoll.h b/src/util/vireventpoll.h
index 8844c9c..172009a 100644
--- a/src/util/vireventpoll.h
+++ b/src/util/vireventpoll.h
@@ -26,6 +26,11 @@
 
 # include "internal.h"
 
+# ifdef HAVE_LINUX_EPOLL
+/* Maximum number of events that are returned by epoll in virEventPollRunOnce */
+#  define MAX_POLL_EVENTS_AT_ONCE 10
+# endif
+
 /**
  * virEventPollAddHandle: register a callback for monitoring file handle events
  *
diff --git a/tests/commanddata/test3epoll.log b/tests/commanddata/test3epoll.log
new file mode 100644
index 0000000..e4619b3
--- /dev/null
+++ b/tests/commanddata/test3epoll.log
@@ -0,0 +1,15 @@
+ENV:DISPLAY=:0.0
+ENV:HOME=/home/test
+ENV:HOSTNAME=test
+ENV:LANG=C
+ENV:LOGNAME=testTMPDIR=/tmp
+ENV:PATH=/usr/bin:/bin
+ENV:USER=test
+FD:0
+FD:1
+FD:2
+FD:6
+FD:8
+DAEMON:no
+CWD:/tmp
+UMASK:0022
diff --git a/tests/commandtest.c b/tests/commandtest.c
index 7bf5447..6b23659 100644
--- a/tests/commandtest.c
+++ b/tests/commandtest.c
@@ -227,7 +227,11 @@ static int test3(const void *unused ATTRIBUTE_UNUSED)
         goto cleanup;
     }
 
+#ifdef HAVE_LINUX_EPOLL
+    ret = checkoutput("test3epoll", NULL);
+#else
     ret = checkoutput("test3", NULL);
+#endif
 
  cleanup:
     virCommandFree(cmd);
-- 
1.9.5.msysgit.0




More information about the libvir-list mailing list