[dm-devel] [RFC PATCH 5/5] multipathd: RFC add new polling dmevents waiter thread

Martin Wilck mwilck at suse.com
Sat Feb 10 19:55:53 UTC 2018


Hi Ben,

thanks a lot for this. I have only a few minor nitpicks (see below).
I suppose you've tested this already?

Regards
Martin

On Fri, 2018-02-09 at 23:07 -0600, Benjamin Marzinski wrote:
> The current method of waiting for dmevents on multipath devices
> involves
> creating a seperate thread for each device. This can become very
> wasteful when there are large numbers of multipath devices. Also,
> since
> multipathd needs to grab the vecs lock to update the devices, the
> additional threads don't actually provide much parallelism.
> 
> The patch adds a new method of updating multipath devices on
> dmevents,
> which uses the new device-mapper event polling interface. This means
> that there is only one dmevent waiting thread which will wait for
> events
> on all of the multipath devices.  Currently the code to get the event
> number from the list of device names and to re-arm the polling
> interface
> is not in libdevmapper, so the patch does that work. Obviously, these
> bits need to go into libdevmapper, so that multipathd can use a
> standard
> interface.
> 
> I haven't touched any of the existing event waiting code, since event
> polling was only added to device-mapper in version
> 4.37.0.  multipathd
> checks this version, and defaults to using the polling code if
> device-mapper supports it. This can be overridden by running
> multipathd
> with "-w", to force it to use the old event waiting code.

Why use a command line option here rather than a config file option?

> 
> Signed-off-by: Benjamin Marzinski <bmarzins at redhat.com>
> ---
>  multipathd/Makefile   |   3 +-
>  multipathd/dmevents.c | 396
> ++++++++++++++++++++++++++++++++++++++++++++++++++
>  multipathd/dmevents.h |  13 ++
>  multipathd/main.c     |  58 +++++++-
>  4 files changed, 461 insertions(+), 9 deletions(-)
>  create mode 100644 multipathd/dmevents.c
>  create mode 100644 multipathd/dmevents.h
> 
> diff --git a/multipathd/Makefile b/multipathd/Makefile
> index 85f29a7..4c438f0 100644
> --- a/multipathd/Makefile
> +++ b/multipathd/Makefile
> @@ -22,7 +22,8 @@ ifdef SYSTEMD
>  	endif
>  endif
>  
> -OBJS = main.o pidfile.o uxlsnr.o uxclnt.o cli.o cli_handlers.o
> waiter.o
> +OBJS = main.o pidfile.o uxlsnr.o uxclnt.o cli.o cli_handlers.o
> waiter.o \
> +       dmevents.o
>  
>  EXEC = multipathd
>  
> diff --git a/multipathd/dmevents.c b/multipathd/dmevents.c
> new file mode 100644
> index 0000000..a56c055
> --- /dev/null
> +++ b/multipathd/dmevents.c
> @@ -0,0 +1,396 @@
> +/*
> + * Copyright (c) 2004, 2005 Christophe Varoqui
> + * Copyright (c) 2005 Kiyoshi Ueda, NEC
> + * Copyright (c) 2005 Edward Goggin, EMC
> + * Copyright (c) 2005, 2018 Benjamin Marzinski, Redhat
> + */
> +#include <unistd.h>
> +#include <libdevmapper.h>
> +#include <sys/mman.h>
> +#include <pthread.h>
> +#include <urcu.h>
> +#include <poll.h>
> +#include <sys/ioctl.h>
> +#include <sys/types.h>
> +#include <sys/stat.h>
> +#include <fcntl.h>
> +#include <linux/dm-ioctl.h>
> +#include <errno.h>
> +
> +#include "vector.h"
> +#include "structs.h"
> +#include "structs_vec.h"
> +#include "devmapper.h"
> +#include "debug.h"
> +#include "dmevents.h"
> +
> +#ifndef DM_DEV_ARM_POLL
> +#define DM_DEV_ARM_POLL _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD + 1,
> struct dm_ioctl)
> +#endif
> +
> +enum event_actions {
> +	EVENT_NOTHING,
> +	EVENT_REMOVE,
> +	EVENT_UPDATE,
> +};
> +
> +struct dev_event {
> +	char name[WWID_SIZE];
> +	uint32_t evt_nr;
> +	enum event_actions action;
> +};
> +
> +struct dmevent_waiter {
> +	int fd;
> +	struct vectors *vecs;
> +	vector events;
> +	pthread_mutex_t events_lock;
> +};
> +
> +static struct dmevent_waiter *waiter;
> +
> +int dmevent_poll_supported(void)
> +{
> +	unsigned int minv[3] = {4, 37, 0};
> +	unsigned int v[3];
> +
> +	if (dm_drv_version(v))
> +		return 0;
> +
> +	if (VERSION_GE(v, minv))
> +		return 1;
> +	return 0;
> +}
> +
> +
> +int alloc_dmevent_waiter(struct vectors *vecs)
> +{
> +	if (!vecs) {
> +		condlog(0, "can't create waiter structure. invalid
> vectors");
> +		goto fail;
> +	}
> +	waiter = (struct dmevent_waiter *)malloc(sizeof(struct
> dmevent_waiter));
> +	if (!waiter) {
> +		condlog(0, "failed to allocate waiter structure");
> +		goto fail;
> +	}
> +	memset(waiter, 0, sizeof(struct dmevent_waiter));
> +	waiter->events = vector_alloc();
> +	if (!waiter->events) {
> +		condlog(0, "failed to allocate waiter events
> vector");
> +		goto fail_waiter;
> +	}
> +	waiter->fd = open("/dev/mapper/control", O_RDWR);
> +	if (waiter->fd < 0) {
> +		condlog(0, "failed to open /dev/mapper/control for
> waiter");
> +		goto fail_events;
> +	}
> +	pthread_mutex_init(&waiter->events_lock, NULL);
> +	waiter->vecs = vecs;
> +
> +	return 0;
> +fail_events:
> +	vector_free(waiter->events);
> +fail_waiter:
> +	free(waiter);
> +fail:
> +	waiter = NULL;
> +	return -1;
> +}

Nitpick: conventionally, an "alloc"-type function would return the
pointer, and NULL on failure.

> +
> +void free_dmevent_waiter(void)
> +{
> +	struct dev_event *dev_evt;
> +	int i;
> +
> +	if (!waiter)
> +		return;
> +	pthread_mutex_destroy(&waiter->events_lock);
> +	close(waiter->fd);
> +	vector_foreach_slot(waiter->events, dev_evt, i)
> +		free(dev_evt);
> +	vector_free(waiter->events);
> +	free(waiter);
> +	waiter = NULL;
> +}

Nitpick: Similarly, a "free" function typically takes the pointer to be
freed as argument. 

> +
> +static int arm_dm_event_poll(int fd)
> +{
> +	struct dm_ioctl dmi;
> +	memset(&dmi, 0, sizeof(dmi));
> +	dmi.version[0] = DM_VERSION_MAJOR;
> +	dmi.version[1] = DM_VERSION_MINOR;
> +	dmi.version[2] = DM_VERSION_PATCHLEVEL;
> +	dmi.flags = 0x4;

What's the meaning of this flag? I couldn't find it in dm-ioctl.h

> +	dmi.data_start = offsetof(struct dm_ioctl, data);
> +	dmi.data_size = sizeof(dmi);
> +	return ioctl(fd, DM_DEV_ARM_POLL, &dmi);
> +}
> +
> +/*
> + * As of version 4.37.0 device-mapper stores the event number in the
> + * dm_names structure after the name, when DM_DEVICE_LIST is called
> + */
> +static uint32_t dm_event_nr(struct dm_names *n)
> +{
> +	return *(uint32_t *)(((uintptr_t)(strchr(n->name, 0) + 1) +
> 7) & ~7);
> +}
> +
> +static int dm_get_events(void)
> +{
> +	struct dm_task *dmt;
> +	struct dm_names *names;
> +	struct dev_event *dev_evt;
> +	int i;
> +
> +	if (!(dmt = libmp_dm_task_create(DM_DEVICE_LIST)))
> +		return -1;
> +
> +	dm_task_no_open_count(dmt);
> +
> +	if (!dm_task_run(dmt))
> +		goto fail;
> +
> +	if (!(names = dm_task_get_names(dmt)))
> +		goto fail;
> +
> +	pthread_mutex_lock(&waiter->events_lock);
> +	vector_foreach_slot(waiter->events, dev_evt, i)
> +		dev_evt->action = EVENT_REMOVE;
> +	while (names->dev) {
> +		uint32_t event_nr;
> +
> +		if (!dm_is_mpath(names->name))
> +			goto next;
> +
> +		event_nr = dm_event_nr(names);
> +		vector_foreach_slot(waiter->events, dev_evt, i) {
> +			if (!strcmp(dev_evt->name, names->name)) {
> +				if (event_nr != dev_evt->evt_nr) {
> +					dev_evt->evt_nr = event_nr;
> +					dev_evt->action =
> EVENT_UPDATE;
> +				} else
> +					dev_evt->action =
> EVENT_NOTHING;
> +				break;
> +			}
> +		}
> +next:
> +		if (!names->next)
> +			break;
> +		names = (void *)names + names->next;
> +	}
> +	pthread_mutex_unlock(&waiter->events_lock);
> +	dm_task_destroy(dmt);
> +	return 0;
> +
> +fail:
> +	dm_task_destroy(dmt);
> +	return -1;
> +}
> +
> +/* You must call update_multipath() after calling this function, to
> + * deal with any events that came in before the device was added */
> +int watch_dmevents(char *name)
> +{
> +	int event_nr;
> +	struct dev_event *dev_evt, *old_dev_evt;
> +	int i;
> +
> +	if (!dm_is_mpath(name)) {
> +		condlog(0, "%s: not a multipath device. can't watch
> events",
> +			name);
> +		return -1;
> +	}
> +
> +	if ((event_nr = dm_geteventnr(name)) < 0)
> +		return -1;
> +
> +	dev_evt = (struct dev_event *)malloc(sizeof(struct
> dev_event));
> +	if (!dev_evt) {
> +		condlog(0, "%s: can't allocate event waiter
> structure", name);
> +		return -1;
> +	}
> +
> +	strncpy(dev_evt->name, name, WWID_SIZE);
> +	dev_evt->name[WWID_SIZE - 1] = 0;

Nitpick: It might be better to use strlcpy or snprintf here.

> +	dev_evt->evt_nr = event_nr;
> +	dev_evt->action = EVENT_NOTHING;
> +
> +	pthread_mutex_lock(&waiter->events_lock);
> +	vector_foreach_slot(waiter->events, old_dev_evt, i){
> +		if (!strcmp(dev_evt->name, old_dev_evt->name)) {
> +			/* caller will be updating this device */
> +			old_dev_evt->evt_nr = event_nr;
> +			old_dev_evt->action = EVENT_NOTHING;
> +			pthread_mutex_unlock(&waiter->events_lock);
> +			condlog(2, "%s: already waiting for events
> on device",
> +				name);
> +			free(dev_evt);
> +			return 0;
> +		}
> +	}
> +	if (!vector_alloc_slot(waiter->events)) {
> +		pthread_mutex_unlock(&waiter->events_lock);
> +		free(dev_evt);
> +		return -1;
> +	}
> +	vector_set_slot(waiter->events, dev_evt);
> +	pthread_mutex_unlock(&waiter->events_lock);
> +	return 0;
> +}
> +
> +void unwatch_all_dmevents(void)
> +{
> +	struct dev_event *dev_evt;
> +	int i;
> +
> +	pthread_mutex_lock(&waiter->events_lock);
> +	vector_foreach_slot(waiter->events, dev_evt, i)
> +		free(dev_evt);
> +	vector_reset(waiter->events);
> +	pthread_mutex_unlock(&waiter->events_lock);
> +}
> +
> +static void unwatch_dmevents(char *name)
> +{
> +	struct dev_event *dev_evt;
> +	int i;
> +
> +	pthread_mutex_lock(&waiter->events_lock);
> +	vector_foreach_slot(waiter->events, dev_evt, i) {
> +		if (!strcmp(dev_evt->name, name)) {
> +			vector_del_slot(waiter->events, i);
> +			free(dev_evt);
> +			break;
> +		}
> +	}
> +	pthread_mutex_unlock(&waiter->events_lock);
> +}
> +
> +/*
> + * returns the reschedule delay
> + * negative means *stop*
> + */
> +
> +/* poll, arm, update, return */
> +static int dmevent_loop (void)
> +{
> +	int r, i = 0;
> +	struct pollfd pfd;
> +	struct dev_event *dev_evt;
> +
> +	pfd.fd = waiter->fd;
> +	pfd.events = POLLIN;
> +	r = poll(&pfd, 1, -1);
> +	if (r <= 0) {
> +		condlog(0, "failed polling for dm events: %s",
> strerror(errno));
> +		/* sleep 1s and hope things get better */
> +		return 1;
> +	}
> +
> +	if (arm_dm_event_poll(waiter->fd) != 0) {
> +		condlog(0, "Cannot re-arm event polling: %s",
> strerror(errno));
> +		/* sleep 1s and hope things get better */
> +		return 1;
> +	}
> +
> +	if (dm_get_events() != 0) {
> +		condlog(0, "failed getting dm events: %s",
> strerror(errno));
> +		/* sleep 1s and hope things get better */
> +		return 1;
> +	}
> +
> +	/*
> +	 * upon event ...
> +	 */
> +
> +	while (1) {
> +		int done = 1;
> +		struct dev_event curr_dev;
> +		struct multipath *mpp;
> +
> +		pthread_mutex_lock(&waiter->events_lock);
> +		vector_foreach_slot(waiter->events, dev_evt, i) {
> +			if (dev_evt->action != EVENT_NOTHING) {
> +				curr_dev = *dev_evt;
> +				if (dev_evt->action == EVENT_REMOVE)
> {
> +					vector_del_slot(waiter-
> >events, i);
> +					free(dev_evt);
> +				} else
> +					dev_evt->action =
> EVENT_NOTHING;
> +				done = 0;
> +				break;
> +			}
> +		}
> +		pthread_mutex_unlock(&waiter->events_lock);
> +		if (done)
> +			return 1;
> +
> +		condlog(3, "%s: devmap event #%i", curr_dev.name,
> +			curr_dev.evt_nr);
> +
> +		/*
> +		 * event might be :
> +		 *
> +		 * 1) a table reload, which means our mpp structure
> is
> +		 *    obsolete : refresh it through
> update_multipath()
> +		 * 2) a path failed by DM : mark as such through
> +		 *    update_multipath()
> +		 * 3) map has gone away : stop the thread.
> +		 * 4) a path reinstate : nothing to do
> +		 * 5) a switch group : nothing to do
> +		 */
> +		pthread_cleanup_push(cleanup_lock, &waiter->vecs-
> >lock);
> +		lock(&waiter->vecs->lock);
> +		pthread_testcancel();
> +		r = 0;
> +		if (curr_dev.action == EVENT_REMOVE) {
> +			mpp = find_mp_by_alias(waiter->vecs->mpvec,
> +					       curr_dev.name);
> +			if (mpp)
> +				remove_map(mpp, waiter->vecs, 1);
> +		} else
> +			r = update_multipath(waiter->vecs,
> curr_dev.name, 1);
> +		lock_cleanup_pop(&waiter->vecs->lock);
> +
> +		if (r) {
> +			condlog(2, "%s: stopped watching dmevents",
> +				curr_dev.name);
> +			unwatch_dmevents(curr_dev.name);
> +		}
> +	}
> +	condlog(0, "dmevent waiter thread unexpectedly quit");
> +	return -1; /* never reach there */
> +}
> +
> +static void rcu_unregister(void *param)
> +{
> +	rcu_unregister_thread();
> +}
> +
> +void *wait_dmevents (void *unused)
> +{
> +	int r;
> +
> +
> +	if (!waiter) {
> +		condlog(0, "dmevents waiter not intialized");
> +		return NULL;
> +	}
> +
> +	pthread_cleanup_push(rcu_unregister, NULL);
> +	rcu_register_thread();
> +	mlockall(MCL_CURRENT | MCL_FUTURE);
> +
> +	while (1) {
> +		r = dmevent_loop();
> +
> +		if (r < 0)
> +			break;
> +
> +		sleep(r);
> +	}
> +
> +	pthread_cleanup_pop(1);
> +	return NULL;
> +}
> diff --git a/multipathd/dmevents.h b/multipathd/dmevents.h
> new file mode 100644
> index 0000000..569e855
> --- /dev/null
> +++ b/multipathd/dmevents.h
> @@ -0,0 +1,13 @@
> +#ifndef _DMEVENTS_H
> +#define _DMEVENTS_H
> +
> +#include "structs_vec.h"
> +
> +int dmevent_poll_supported(void);
> +int alloc_dmevent_waiter(struct vectors *vecs);
> +void free_dmevent_waiter(void);
> +int watch_dmevents(char *name);
> +void unwatch_all_dmevents(void);
> +void *wait_dmevents (void *unused);
> +
> +#endif /* _DMEVENTS_H */
> diff --git a/multipathd/main.c b/multipathd/main.c
> index 2963bde..6dabf2c 100644
> --- a/multipathd/main.c
> +++ b/multipathd/main.c
> @@ -82,6 +82,7 @@ static int use_watchdog;
>  #include "cli_handlers.h"
>  #include "lock.h"
>  #include "waiter.h"
> +#include "dmevents.h"
>  #include "io_err_stat.h"
>  #include "wwids.h"
>  #include "../third-party/valgrind/drd.h"
> @@ -108,6 +109,7 @@ int uxsock_timeout;
>  int verbosity;
>  int bindings_read_only;
>  int ignore_new_devs;
> +int poll_dmevents = 1;
>  enum daemon_status running_state = DAEMON_INIT;
>  pid_t daemon_pid;
>  pthread_mutex_t config_lock = PTHREAD_MUTEX_INITIALIZER;
> @@ -288,11 +290,23 @@ switch_pathgroup (struct multipath * mpp)
>  		 mpp->alias, mpp->bestpg);
>  }
>  
> +static int
> +wait_for_events(struct multipath *mpp, struct vectors *vecs)
> +{
> +	if (poll_dmevents)
> +		return watch_dmevents(mpp->alias);
> +	else
> +		return start_waiter_thread(mpp, vecs);
> +}
> +
>  static void
>  remove_map_and_stop_waiter(struct multipath *mpp, struct vectors
> *vecs,
>  			   int purge_vec)
>  {
> -	stop_waiter_thread(mpp, vecs);
> +	/* devices are automatically removed by the dmevent polling
> code,
> +	 * so they don't need to be manually removed here */
> +	if (!poll_dmevents)
> +		stop_waiter_thread(mpp, vecs);
>  	remove_map(mpp, vecs, purge_vec);
>  }
>  
> @@ -305,8 +319,12 @@ remove_maps_and_stop_waiters(struct vectors
> *vecs)
>  	if (!vecs)
>  		return;
>  
> -	vector_foreach_slot(vecs->mpvec, mpp, i)
> -		stop_waiter_thread(mpp, vecs);
> +	if (!poll_dmevents) {
> +		vector_foreach_slot(vecs->mpvec, mpp, i)
> +			stop_waiter_thread(mpp, vecs);
> +	}
> +	else
> +		unwatch_all_dmevents();
>  
>  	remove_maps(vecs);
>  }
> @@ -351,7 +369,7 @@ retry:
>  	dm_lib_release();
>  
>  fail:
> -	if (new_map && (retries < 0 || start_waiter_thread(mpp,
> vecs))) {
> +	if (new_map && (retries < 0 || wait_for_events(mpp, vecs)))
> {
>  		condlog(0, "%s: failed to create new map", mpp-
> >alias);
>  		remove_map(mpp, vecs, 1);
>  		return 1;
> @@ -870,7 +888,7 @@ retry:
>  
>  	if ((mpp->action == ACT_CREATE ||
>  	     (mpp->action == ACT_NOTHING && start_waiter && !mpp-
> >waiter)) &&
> -	    start_waiter_thread(mpp, vecs))
> +	    wait_for_events(mpp, vecs))
>  			goto fail_map;
>  
>  	/*
> @@ -2173,7 +2191,7 @@ configure (struct vectors * vecs)
>  	 * start dm event waiter threads for these new maps
>  	 */
>  	vector_foreach_slot(vecs->mpvec, mpp, i) {
> -		if (start_waiter_thread(mpp, vecs)) {
> +		if (wait_for_events(mpp, vecs)) {
>  			remove_map(mpp, vecs, 1);
>  			i--;
>  			continue;
> @@ -2414,7 +2432,7 @@ set_oom_adj (void)
>  static int
>  child (void * param)
>  {
> -	pthread_t check_thr, uevent_thr, uxlsnr_thr, uevq_thr;
> +	pthread_t check_thr, uevent_thr, uxlsnr_thr, uevq_thr,
> dmevent_thr;
>  	pthread_attr_t log_attr, misc_attr, uevent_attr;
>  	struct vectors * vecs;
>  	struct multipath * mpp;
> @@ -2476,6 +2494,8 @@ child (void * param)
>  		goto failed;
>  	}
>  
> +	if (poll_dmevents)
> +		poll_dmevents = dmevent_poll_supported();
>  	setlogmask(LOG_UPTO(conf->verbosity + 3));
>  
>  	envp = getenv("LimitNOFILE");
> @@ -2542,6 +2562,19 @@ child (void * param)
>  
>  	init_path_check_interval(vecs);
>  
> +	if (poll_dmevents) {
> +		if (alloc_dmevent_waiter(vecs)) {
> +			condlog(0, "failed to allocate dmevents
> waiter info");
> +			goto failed;
> +		}
> +		if ((rc = pthread_create(&dmevent_thr, &misc_attr,
> +					 wait_dmevents, NULL))) {
> +			condlog(0, "failed to create dmevent waiter
> thread: %d",
> +				rc);
> +			goto failed;
> +		}
> +	}
> +
>  	/*
>  	 * Start uevent listener early to catch events
>  	 */
> @@ -2615,11 +2648,15 @@ child (void * param)
>  	pthread_cancel(uevent_thr);
>  	pthread_cancel(uxlsnr_thr);
>  	pthread_cancel(uevq_thr);
> +	if (poll_dmevents)
> +		pthread_cancel(dmevent_thr);
>  
>  	pthread_join(check_thr, NULL);
>  	pthread_join(uevent_thr, NULL);
>  	pthread_join(uxlsnr_thr, NULL);
>  	pthread_join(uevq_thr, NULL);
> +	if (poll_dmevents)
> +		pthread_join(dmevent_thr, NULL);
>  
>  	stop_io_err_stat_thread();
>  
> @@ -2634,6 +2671,8 @@ child (void * param)
>  
>  	cleanup_checkers();
>  	cleanup_prio();
> +	if (poll_dmevents)
> +		free_dmevent_waiter();
>  
>  	dm_lib_release();
>  	dm_lib_exit();
> @@ -2765,7 +2804,7 @@ main (int argc, char *argv[])
>  	udev = udev_new();
>  	libmp_udev_set_sync_support(0);
>  
> -	while ((arg = getopt(argc, argv, ":dsv:k::Bn")) != EOF ) {
> +	while ((arg = getopt(argc, argv, ":dsv:k::Bniw")) != EOF ) {
>  		switch(arg) {
>  		case 'd':
>  			foreground = 1;
> @@ -2799,6 +2838,9 @@ main (int argc, char *argv[])
>  		case 'n':
>  			ignore_new_devs = 1;
>  			break;
> +		case 'w':
> +			poll_dmevents = 0;
> +			break;
>  		default:
>  			fprintf(stderr, "Invalid argument '-%c'\n",
>  				optopt);

-- 
Dr. Martin Wilck <mwilck at suse.com>, Tel. +49 (0)911 74053 2107
SUSE Linux GmbH, GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)




More information about the dm-devel mailing list