[Cluster-devel] [PATCH] qdiskd: Make multipath issues go away

Fabio M. Di Nitto fdinitto at redhat.com
Fri Feb 17 05:38:42 UTC 2012


The patch looks good and ACK, with one minor nitpick.

that 178 port value in cman_recv/send_data is rather cryptic.

I would prefer to see it defined as others in cman/cnxman-socket.h
for documentation purposes (so we know it's QDISK).

Fabio

On 02/16/2012 12:53 AM, Lon Hohberger wrote:
> Qdiskd hsitorically has required significant tuning to work around
> delays which occur during multipath failover, overloaded I/O, and LUN
> trespasses in both device-mapper-multipath and EMC PowerPath
> environments.
> 
> This patch goes a very long way towards eliminating false evictions
> when these conditions occur by making qdiskd whine to the other
> cluster members when it detects hung system calls.  When a cluster
> member whines, it indicates the source of the problem (which system
> call is hung), and the act of receiving a whine from a host indicates
> that qdiskd is operational, but that I/O is hung.  Hung I/O is different
> from losing storage entirely (where you get I/O errors).
> 
> Possible problems:
> 
> - Receive queue getting very full, causing messages to become blocked on
> a node where I/O is hung.  1) that would take a very long time, and 2)
> node should get evicted at that point anyway.
> 
> Resolves: rhbz#678372
> 
> Signed-off-by: Lon Hohberger <lhh at redhat.com>
> ---
>  cman/qdisk/Makefile  |    2 +-
>  cman/qdisk/disk.h    |    6 ++++++
>  cman/qdisk/iostate.c |   16 +++++++++++++---
>  cman/qdisk/iostate.h |    4 +++-
>  cman/qdisk/main.c    |   45 ++++++++++++++++++++++++++++++++++++++++++---
>  5 files changed, 65 insertions(+), 8 deletions(-)
> 
> diff --git a/cman/qdisk/Makefile b/cman/qdisk/Makefile
> index 68e20cd..e3bb5f7 100644
> --- a/cman/qdisk/Makefile
> +++ b/cman/qdisk/Makefile
> @@ -40,7 +40,7 @@ ${TARGET1}: ${SHAREDOBJS} ${OBJS1}
>  	$(CC) -o $@ $^ $(EXTRA_LDFLAGS) $(LDFLAGS)
>  
>  ${TARGET2}: ${SHAREDOBJS} ${OBJS2}
> -	$(CC) -o $@ $^ $(LDFLAGS)
> +	$(CC) -o $@ $^ $(EXTRA_LDFLAGS) $(LDFLAGS)
>  
>  depends:
>  	$(MAKE) -C ../lib all
> diff --git a/cman/qdisk/disk.h b/cman/qdisk/disk.h
> index 93d15fe..fd80fa6 100644
> --- a/cman/qdisk/disk.h
> +++ b/cman/qdisk/disk.h
> @@ -273,6 +273,12 @@ typedef struct {
>  	status_block_t ni_status;
>  } node_info_t;
>  
> +typedef struct {
> +	qd_ctx *ctx;
> +	node_info_t *ni;
> +	size_t ni_len;
> +} qd_priv_t;
> +
>  int qd_write_status(qd_ctx *ctx, int nid, disk_node_state_t state,
>  		    disk_msg_t *msg, memb_mask_t mask, memb_mask_t master);
>  int qd_init(qd_ctx *ctx, cman_handle_t ch_admin,
> diff --git a/cman/qdisk/iostate.c b/cman/qdisk/iostate.c
> index 0199da4..06c6831 100644
> --- a/cman/qdisk/iostate.c
> +++ b/cman/qdisk/iostate.c
> @@ -1,9 +1,12 @@
>  #include <pthread.h>
> +#include <libcman.h>
>  #include <iostate.h>
>  #include <unistd.h>
>  #include <time.h>
>  #include <sys/time.h>
>  #include <liblogthread.h>
> +#include <stdint.h>
> +#include "platform.h"
>  #include "iostate.h"
>  
>  static iostate_t main_state = 0;
> @@ -26,7 +29,7 @@ static struct state_table io_state_table[] = {
>  {	STATE_LSEEK,	"seek"	},
>  {	-1,		NULL	} };
>  
> -static const char *
> +const char *
>  state_to_string(iostate_t state)
>  {
>  	static const char *ret = "unknown";
> @@ -65,6 +68,8 @@ io_nanny_thread(void *arg)
>  	iostate_t last_main_state = 0, current_main_state = 0;
>  	int last_main_incarnation = 0, current_main_incarnation = 0;
>  	int logged_incarnation = 0;
> +	cman_handle_t ch = (cman_handle_t)arg;
> +	int32_t whine_state;
>  
>  	/* Start with wherever we're at now */
>  	pthread_mutex_lock(&state_mutex);
> @@ -96,6 +101,11 @@ io_nanny_thread(void *arg)
>  			continue;
>  		}
>  
> +		/* Whine on CMAN api */
> +		whine_state = (int32_t)current_main_state;
> +		swab32(whine_state);
> +		cman_send_data(ch, &whine_state, sizeof(int32_t), 0, 178, 0);
> +
>  		/* Don't log things twice */
>  		if (logged_incarnation == current_main_incarnation)
>  			continue;
> @@ -114,7 +124,7 @@ io_nanny_thread(void *arg)
>  
>  
>  int
> -io_nanny_start(int timeout)
> +io_nanny_start(cman_handle_t ch, int timeout)
>  {
>  	int ret;
>  
> @@ -124,7 +134,7 @@ io_nanny_start(int timeout)
>  	qdisk_timeout = timeout;
>  	thread_active = 1;
>  
> -	ret = pthread_create(&io_nanny_tid, NULL, io_nanny_thread, NULL);
> +	ret = pthread_create(&io_nanny_tid, NULL, io_nanny_thread, ch);
>  	pthread_mutex_unlock(&state_mutex);
>  
>  	return ret;
> diff --git a/cman/qdisk/iostate.h b/cman/qdisk/iostate.h
> index 7dd7bf6..a65b1d4 100644
> --- a/cman/qdisk/iostate.h
> +++ b/cman/qdisk/iostate.h
> @@ -11,7 +11,9 @@ typedef enum {
>  
>  void io_state(iostate_t state);
>  
> -int io_nanny_start(int timeout);
> +int io_nanny_start(cman_handle_t ch, int timeout);
>  int io_nanny_stop(void);
>  
> +const char * state_to_string(iostate_t state);
> +
>  #endif
> diff --git a/cman/qdisk/main.c b/cman/qdisk/main.c
> index a90e82c..d613d84 100644
> --- a/cman/qdisk/main.c
> +++ b/cman/qdisk/main.c
> @@ -915,7 +915,8 @@ cman_wait(cman_handle_t ch, struct timeval *_tv)
>  static void
>  process_cman_event(cman_handle_t handle, void *private, int reason, int arg)
>  {
> -	qd_ctx *ctx = (qd_ctx *)private;
> +	qd_priv_t *qp = (qd_priv_t *)private;
> +	qd_ctx *ctx = qp->ctx;
>  
>  	switch(reason) {
>  	case CMAN_REASON_PORTOPENED:
> @@ -1926,6 +1927,33 @@ check_stop_cman(qd_ctx *ctx)
>  do { static int _logged=0; if (!_logged) { _logged=1; logt_print(level, fmt, ##args); } } while(0)
>  
>  
> +static void
> +qdisk_whine(cman_handle_t h, void *privdata, char *buf, int len,
> +	    uint8_t port, int nodeid)
> +{
> +	int32_t dstate;
> +	qd_priv_t *qp = (qd_priv_t *)privdata;
> +	node_info_t *ni = qp->ni;
> +
> +	if (len != sizeof(dstate)) {
> +		return;
> +	}
> +
> +	dstate = *((int32_t*)buf);
> +
> +	if (nodeid == (qp->ctx->qc_my_id))
> +		return;
> +
> +	swab32(dstate);
> +
> +	if (dstate) {
> +		logt_print(LOG_NOTICE, "qdiskd on node %d reports hung %s()\n", nodeid,
> +			   state_to_string(dstate));
> +		ni[nodeid-1].ni_misses = 0;
> +	}
> +}
> +
> +
>  int
>  main(int argc, char **argv)
>  {
> @@ -1939,6 +1967,7 @@ main(int argc, char **argv)
>  	char device[128];
>  	pid_t pid;
>  	quorum_header_t qh;
> +	qd_priv_t qp;
>  
>  	if (check_process_running(argv[0], &pid) && pid !=getpid()) {
>  		printf("QDisk services already running\n");
> @@ -2001,13 +2030,23 @@ main(int argc, char **argv)
>  
>  	/* For cman notifications we need two sockets - one for events,
>  	   one for config change callbacks */
> -	ch_user = cman_init(&ctx);
> +	qp.ctx = &ctx;
> +	qp.ni = &ni[0];
> +	qp.ni_len = MAX_NODES_DISK;
> +
> +	ch_user = cman_init(&qp);
>          if (cman_start_notification(ch_user, process_cman_event) != 0) {
>  		logt_print(LOG_CRIT, "Could not register with CMAN: %s\n",
>  			   strerror(errno));
>  		goto out;
>  	}
>  
> +	if (cman_start_recv_data(ch_user, qdisk_whine, 178) != 0) {
> +		logt_print(LOG_CRIT, "Could not register with CMAN: %s\n",
> +			   strerror(errno));
> +		goto out;
> +	}
> +
>  	memset(&me, 0, sizeof(me));
>  	if (cman_get_node(ch_admin, CMAN_NODEID_US, &me) < 0) {
>  		logt_print(LOG_CRIT, "Could not determine local node ID: %s\n",
> @@ -2108,7 +2147,7 @@ main(int argc, char **argv)
>  		}
>  	}
>  
> -	io_nanny_start(ctx.qc_tko * ctx.qc_interval);
> +	io_nanny_start(ch_user, ctx.qc_tko * ctx.qc_interval);
>  
>  	if (quorum_loop(&ctx, ni, MAX_NODES_DISK) == 0) {
>  		/* Only clean up if we're exiting w/o error) */




More information about the Cluster-devel mailing list