[dm-devel] [PATCH v3] multipath-tools: intermittent IO error accounting to improve reliability

Guan Junxiong guanjunxiong at huawei.com
Tue Sep 5 02:59:00 UTC 2017


Hi Martin,
Thanks for your remarks and for the compile check.
A new patch will be sent this week.
Other responses are inline.

Cheers
Guan Junxiong

On 2017/9/5 4:59, Martin Wilck wrote:
> Hi Guan,
> 
> some remarks below. This patch is quite big - I've done my best but
> this shouldn't be considered an extensive review.
> 
> On Mon, 2017-09-04 at 22:06 +0800, Guan Junxiong wrote:
>> This patch adds a new method of path state checking based on
>> accounting
>> IO error. This is useful in many scenarios such as intermittent IO
>> error
>> an a path due to network congestion, or a shaky link.
>>
>> Three parameters are added for the admin: "path_io_err_sample_time",
>> "path_io_err_rate_threshold" and "path_io_err_recovery_time".
>> If path_io_err_sample_time and path_io_err_recovery_time are set to a
>> value greater than 0, when a path fail event occurs due to an IO
>> error,
>> multipathd will enqueue this path into a queue of which each member
>> is
>> sent a couple of continuous direct reading asynchronous io at a fixed
>> sample rate of 10HZ. The IO accounting process for a path will last
>> for
>> path_io_err_sample_time. If the IO error rate on a particular path is
>> greater than the path_io_err_rate_threshold, then the path will not
>> reinstate for recover_time seconds unless there is only one active
>> path.
>>
>> If recover_time expires, we will reschedule this IO error checking
>> process. If the path is good enough, we will claim it good.
>>
>> This helps us place the path in delayed state if we hit a lot of
>> intermittent IO errors on a particular path due to network/target
>> issues and isolate such degraded path and allow the admin to rectify
>> the errors on a path.
>>
>> Signed-off-by: Junxiong Guan <guanjunxiong at huawei.com>
>> ---
>>
>> Changes from V2:
>> * fix uncondistional rescedule forverver
>> * use script/checkpatch.pl in Linux to cleanup informal coding style
>> * fix "continous" and "internel" typos
>>
>> Changes from V1:
>> * send continous IO instead of a single IO in a sample interval
>> (Martin)
>> * when recover time expires, we reschedule the checking process
>> (Hannes)
>> * Use the error rate threshold as a permillage instead of IO
>> number(Martin)
>> * Use a common io_context for libaio for all paths (Martin)
>> * Other small fixes (Martin)
>>
>>
>>  libmultipath/Makefile      |   5 +-
>>  libmultipath/config.h      |   9 +
>>  libmultipath/configure.c   |   3 +
>>  libmultipath/dict.c        |  41 +++
>>  libmultipath/io_err_stat.c | 717
>> +++++++++++++++++++++++++++++++++++++++++++++
>>  libmultipath/io_err_stat.h |  15 +
>>  libmultipath/propsel.c     |  53 ++++
>>  libmultipath/propsel.h     |   3 +
>>  libmultipath/structs.h     |   7 +
>>  libmultipath/uevent.c      |  36 ++-
>>  libmultipath/uevent.h      |   2 +
>>  multipath/multipath.conf.5 |  43 +++
>>  multipathd/main.c          |  56 ++++
>>  13 files changed, 986 insertions(+), 4 deletions(-)
>>  create mode 100644 libmultipath/io_err_stat.c
>>  create mode 100644 libmultipath/io_err_stat.h
>>
>> diff --git a/libmultipath/Makefile b/libmultipath/Makefile
>> index b3244fc7..dce73afe 100644
>> --- a/libmultipath/Makefile
>> +++ b/libmultipath/Makefile
>> @@ -9,7 +9,7 @@ LIBS = $(DEVLIB).$(SONAME)
>>  
>>  CFLAGS += $(LIB_CFLAGS) -I$(mpathcmddir)
>>  
>> -LIBDEPS += -lpthread -ldl -ldevmapper -ludev -L$(mpathcmddir)
>> -lmpathcmd -lurcu
>> +LIBDEPS += -lpthread -ldl -ldevmapper -ludev -L$(mpathcmddir)
>> -lmpathcmd -lurcu -laio
>>  
>>  ifdef SYSTEMD
>>  	CFLAGS += -DUSE_SYSTEMD=$(SYSTEMD)
>> @@ -42,7 +42,8 @@ OBJS = memory.o parser.o vector.o devmapper.o
>> callout.o \
>>  	pgpolicies.o debug.o defaults.o uevent.o time-util.o \
>>  	switchgroup.o uxsock.o print.o alias.o log_pthread.o \
>>  	log.o configure.o structs_vec.o sysfs.o prio.o checkers.o \
>> -	lock.o waiter.o file.o wwids.o prioritizers/alua_rtpg.o
>> +	lock.o waiter.o file.o wwids.o prioritizers/alua_rtpg.o \
>> +	io_err_stat.o
>>  
>>  all: $(LIBS)
>>  
>> diff --git a/libmultipath/config.h b/libmultipath/config.h
>> index ffc69b5f..215d29e9 100644
>> --- a/libmultipath/config.h
>> +++ b/libmultipath/config.h
>> @@ -75,6 +75,9 @@ struct hwentry {
>>  	int san_path_err_threshold;
>>  	int san_path_err_forget_rate;
>>  	int san_path_err_recovery_time;
>> +	int path_io_err_sample_time;
>> +	int path_io_err_rate_threshold;
>> +	int path_io_err_recovery_time;
>>  	int skip_kpartx;
>>  	int max_sectors_kb;
>>  	char * bl_product;
>> @@ -106,6 +109,9 @@ struct mpentry {
>>  	int san_path_err_threshold;
>>  	int san_path_err_forget_rate;
>>  	int san_path_err_recovery_time;
>> +	int path_io_err_sample_time;
>> +	int path_io_err_rate_threshold;
>> +	int path_io_err_recovery_time;
>>  	int skip_kpartx;
>>  	int max_sectors_kb;
>>  	uid_t uid;
>> @@ -155,6 +161,9 @@ struct config {
>>  	int san_path_err_threshold;
>>  	int san_path_err_forget_rate;
>>  	int san_path_err_recovery_time;
>> +	int path_io_err_sample_time;
>> +	int path_io_err_rate_threshold;
>> +	int path_io_err_recovery_time;
>>  	int uxsock_timeout;
>>  	int strict_timing;
>>  	int retrigger_tries;
>> diff --git a/libmultipath/configure.c b/libmultipath/configure.c
>> index 74b6f52a..81dc97d9 100644
>> --- a/libmultipath/configure.c
>> +++ b/libmultipath/configure.c
>> @@ -298,6 +298,9 @@ int setup_map(struct multipath *mpp, char
>> *params, int params_size)
>>  	select_san_path_err_threshold(conf, mpp);
>>  	select_san_path_err_forget_rate(conf, mpp);
>>  	select_san_path_err_recovery_time(conf, mpp);
>> +	select_path_io_err_sample_time(conf, mpp);
>> +	select_path_io_err_rate_threshold(conf, mpp);
>> +	select_path_io_err_recovery_time(conf, mpp);
>>  	select_skip_kpartx(conf, mpp);
>>  	select_max_sectors_kb(conf, mpp);
>>  
>> diff --git a/libmultipath/dict.c b/libmultipath/dict.c
>> index 9dc10904..18b1fdb1 100644
>> --- a/libmultipath/dict.c
>> +++ b/libmultipath/dict.c
>> @@ -1108,6 +1108,35 @@ declare_hw_handler(san_path_err_recovery_time,
>> set_off_int_undef)
>>  declare_hw_snprint(san_path_err_recovery_time, print_off_int_undef)
>>  declare_mp_handler(san_path_err_recovery_time, set_off_int_undef)
>>  declare_mp_snprint(san_path_err_recovery_time, print_off_int_undef)
>> +declare_def_handler(path_io_err_sample_time, set_off_int_undef)
>> +declare_def_snprint_defint(path_io_err_sample_time,
>> print_off_int_undef,
>> +			   DEFAULT_ERR_CHECKS)
>> +declare_ovr_handler(path_io_err_sample_time, set_off_int_undef)
>> +declare_ovr_snprint(path_io_err_sample_time, print_off_int_undef)
>> +declare_hw_handler(path_io_err_sample_time, set_off_int_undef)
>> +declare_hw_snprint(path_io_err_sample_time, print_off_int_undef)
>> +declare_mp_handler(path_io_err_sample_time, set_off_int_undef)
>> +declare_mp_snprint(path_io_err_sample_time, print_off_int_undef)
>> +declare_def_handler(path_io_err_rate_threshold, set_off_int_undef)
>> +declare_def_snprint_defint(path_io_err_rate_threshold,
>> print_off_int_undef,
>> +			   DEFAULT_ERR_CHECKS)
>> +declare_ovr_handler(path_io_err_rate_threshold, set_off_int_undef)
>> +declare_ovr_snprint(path_io_err_rate_threshold, print_off_int_undef)
>> +declare_hw_handler(path_io_err_rate_threshold, set_off_int_undef)
>> +declare_hw_snprint(path_io_err_rate_threshold, print_off_int_undef)
>> +declare_mp_handler(path_io_err_rate_threshold, set_off_int_undef)
>> +declare_mp_snprint(path_io_err_rate_threshold, print_off_int_undef)
>> +declare_def_handler(path_io_err_recovery_time, set_off_int_undef)
>> +declare_def_snprint_defint(path_io_err_recovery_time,
>> print_off_int_undef,
>> +			   DEFAULT_ERR_CHECKS)
>> +declare_ovr_handler(path_io_err_recovery_time, set_off_int_undef)
>> +declare_ovr_snprint(path_io_err_recovery_time, print_off_int_undef)
>> +declare_hw_handler(path_io_err_recovery_time, set_off_int_undef)
>> +declare_hw_snprint(path_io_err_recovery_time, print_off_int_undef)
>> +declare_mp_handler(path_io_err_recovery_time, set_off_int_undef)
>> +declare_mp_snprint(path_io_err_recovery_time, print_off_int_undef)
>> +
>> +
>>  static int
>>  def_uxsock_timeout_handler(struct config *conf, vector strvec)
>>  {
>> @@ -1443,6 +1472,9 @@ init_keywords(vector keywords)
>>  	install_keyword("san_path_err_threshold",
>> &def_san_path_err_threshold_handler,
>> &snprint_def_san_path_err_threshold);
>>  	install_keyword("san_path_err_forget_rate",
>> &def_san_path_err_forget_rate_handler,
>> &snprint_def_san_path_err_forget_rate);
>>  	install_keyword("san_path_err_recovery_time",
>> &def_san_path_err_recovery_time_handler,
>> &snprint_def_san_path_err_recovery_time);
>> +	install_keyword("path_io_err_sample_time",
>> &def_path_io_err_sample_time_handler,
>> &snprint_def_path_io_err_sample_time);
>> +	install_keyword("path_io_err_rate_threshold",
>> &def_path_io_err_rate_threshold_handler,
>> &snprint_def_path_io_err_rate_threshold);
>> +	install_keyword("path_io_err_recovery_time",
>> &def_path_io_err_recovery_time_handler,
>> &snprint_def_path_io_err_recovery_time);
>>  
>>  	install_keyword("find_multipaths",
>> &def_find_multipaths_handler, &snprint_def_find_multipaths);
>>  	install_keyword("uxsock_timeout",
>> &def_uxsock_timeout_handler, &snprint_def_uxsock_timeout);
>> @@ -1530,6 +1562,9 @@ init_keywords(vector keywords)
>>  	install_keyword("san_path_err_threshold",
>> &hw_san_path_err_threshold_handler,
>> &snprint_hw_san_path_err_threshold);
>>  	install_keyword("san_path_err_forget_rate",
>> &hw_san_path_err_forget_rate_handler,
>> &snprint_hw_san_path_err_forget_rate);
>>  	install_keyword("san_path_err_recovery_time",
>> &hw_san_path_err_recovery_time_handler,
>> &snprint_hw_san_path_err_recovery_time);
>> +	install_keyword("path_io_err_sample_time",
>> &hw_path_io_err_sample_time_handler,
>> &snprint_hw_path_io_err_sample_time);
>> +	install_keyword("path_io_err_rate_threshold",
>> &hw_path_io_err_rate_threshold_handler,
>> &snprint_hw_path_io_err_rate_threshold);
>> +	install_keyword("path_io_err_recovery_time",
>> &hw_path_io_err_recovery_time_handler,
>> &snprint_hw_path_io_err_recovery_time);
>>  	install_keyword("skip_kpartx", &hw_skip_kpartx_handler,
>> &snprint_hw_skip_kpartx);
>>  	install_keyword("max_sectors_kb",
>> &hw_max_sectors_kb_handler, &snprint_hw_max_sectors_kb);
>>  	install_sublevel_end();
>> @@ -1563,6 +1598,9 @@ init_keywords(vector keywords)
>>  	install_keyword("san_path_err_threshold",
>> &ovr_san_path_err_threshold_handler,
>> &snprint_ovr_san_path_err_threshold);
>>  	install_keyword("san_path_err_forget_rate",
>> &ovr_san_path_err_forget_rate_handler,
>> &snprint_ovr_san_path_err_forget_rate);
>>  	install_keyword("san_path_err_recovery_time",
>> &ovr_san_path_err_recovery_time_handler,
>> &snprint_ovr_san_path_err_recovery_time);
>> +	install_keyword("path_io_err_sample_time",
>> &ovr_path_io_err_sample_time_handler,
>> &snprint_ovr_path_io_err_sample_time);
>> +	install_keyword("path_io_err_rate_threshold",
>> &ovr_path_io_err_rate_threshold_handler,
>> &snprint_ovr_path_io_err_rate_threshold);
>> +	install_keyword("path_io_err_recovery_time",
>> &ovr_path_io_err_recovery_time_handler,
>> &snprint_ovr_path_io_err_recovery_time);
>>  
>>  	install_keyword("skip_kpartx", &ovr_skip_kpartx_handler,
>> &snprint_ovr_skip_kpartx);
>>  	install_keyword("max_sectors_kb",
>> &ovr_max_sectors_kb_handler, &snprint_ovr_max_sectors_kb);
>> @@ -1595,6 +1633,9 @@ init_keywords(vector keywords)
>>  	install_keyword("san_path_err_threshold",
>> &mp_san_path_err_threshold_handler,
>> &snprint_mp_san_path_err_threshold);
>>  	install_keyword("san_path_err_forget_rate",
>> &mp_san_path_err_forget_rate_handler,
>> &snprint_mp_san_path_err_forget_rate);
>>  	install_keyword("san_path_err_recovery_time",
>> &mp_san_path_err_recovery_time_handler,
>> &snprint_mp_san_path_err_recovery_time);
>> +	install_keyword("path_io_err_sample_time",
>> &mp_path_io_err_sample_time_handler,
>> &snprint_mp_path_io_err_sample_time);
>> +	install_keyword("path_io_err_rate_threshold",
>> &mp_path_io_err_rate_threshold_handler,
>> &snprint_mp_path_io_err_rate_threshold);
>> +	install_keyword("path_io_err_recovery_time",
>> &mp_path_io_err_recovery_time_handler,
>> &snprint_mp_path_io_err_recovery_time);
>>  	install_keyword("skip_kpartx", &mp_skip_kpartx_handler,
>> &snprint_mp_skip_kpartx);
>>  	install_keyword("max_sectors_kb",
>> &mp_max_sectors_kb_handler, &snprint_mp_max_sectors_kb);
>>  	install_sublevel_end();
>> diff --git a/libmultipath/io_err_stat.c b/libmultipath/io_err_stat.c
>> new file mode 100644
>> index 00000000..5b41cb35
>> --- /dev/null
>> +++ b/libmultipath/io_err_stat.c
>> @@ -0,0 +1,717 @@
>> +/*
>> + * (C) Copyright HUAWEI Technology Corp. 2017, All Rights Reserved.
>> + *
>> + * io_err_stat.c
>> + * version 1.0
>> + *
>> + * IO error stream statistic process for path failure event from
>> kernel
>> + *
>> + * Author(s): Guan Junxiong 2017 <guanjunxiong at huawei.com>
>> + *
>> + * This file is released under the GPL version 2, or any later
>> version.
>> + */
>> +
>> +#include <unistd.h>
>> +#include <pthread.h>
>> +#include <signal.h>
>> +#include <fcntl.h>
>> +#include <sys/stat.h>
>> +#include <sys/ioctl.h>
>> +#include <linux/fs.h>
>> +#include <libaio.h>
>> +#include <errno.h>
>> +#include <sys/mman.h>
>> +
>> +#include "vector.h"
>> +#include "memory.h"
>> +#include "checkers.h"
>> +#include "config.h"
>> +#include "structs.h"
>> +#include "structs_vec.h"
>> +#include "devmapper.h"
>> +#include "debug.h"
>> +#include "lock.h"
>> +#include "time-util.h"
>> +#include "io_err_stat.h"
>> +
>> +#define IOTIMEOUT_SEC			60
>> +#define FLAKY_PATHFAIL_THRESHOLD	2
>> +#define FLAKY_PATHFAIL_TIME_FRAME	60
>> +#define CONCUR_NR_EVENT			32
>> +
>> +#define io_err_stat_log(prio, fmt, args...) \
>> +	condlog(prio, "io error statistic: " fmt, ##args)
>> +
>> +
>> +struct io_err_stat_pathvec {
>> +	pthread_mutex_t mutex;
>> +	vector		pathvec;
>> +};
>> +
>> +struct dio_ctx {
>> +	struct timespec	io_starttime;
>> +	int		blksize;
>> +	unsigned char	*buf;
>> +	unsigned char	*ptr;
>> +	struct iocb	io;
>> +};
>> +
>> +struct io_err_stat_path {
>> +	char		devname[FILE_NAME_SIZE];
>> +	int		fd;
>> +	struct dio_ctx	**dio_ctx_array;
>> +	int		io_err_nr;
>> +	int		io_nr;
>> +	struct timespec	start_time;
>> +
>> +	int		total_time;
>> +	int		err_rate_threshold;
>> +};
>> +
>> +pthread_t		io_err_stat_thr;
>> +pthread_attr_t		io_err_stat_attr;
>> +
>> +static struct io_err_stat_pathvec *paths;
>> +struct vectors *vecs;
>> +io_context_t	ioctx;
>> +
>> +static void cancel_inflight_io(struct io_err_stat_path *pp);
>> +
>> +static void rcu_unregister(void *param)
>> +{
>> +	rcu_unregister_thread();
>> +}
>> +
>> +struct io_err_stat_path *find_err_path_by_dev(vector pathvec, char
>> *dev)
>> +{
>> +	int i;
>> +	struct io_err_stat_path *pp;
>> +
>> +	if (!pathvec)
>> +		return NULL;
>> +	vector_foreach_slot(pathvec, pp, i)
>> +		if (!strcmp(pp->devname, dev))
>> +			return pp;
>> +
>> +	io_err_stat_log(4, "%s: not found in check queue", dev);
>> +
>> +	return NULL;
>> +}
>> +
>> +static int init_each_dio_ctx(struct dio_ctx *ct, int blksize,
>> +		unsigned long pgsize)
>> +{
>> +	ct->blksize = blksize;
>> +	ct->buf = (unsigned char *)MALLOC(blksize + pgsize);
>> +	if (!ct->buf)
>> +		return 1;
>> +	ct->ptr = (unsigned char *)(((unsigned long)ct->buf +
>> +				pgsize - 1) & (~(pgsize - 1)));
> 
> 
> This looks to me as if you should rather use posix_memalign().
> 
Great, I will alter this.
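
For example, a minimal sketch of what I have in mind (keeping the existing
dio_ctx fields; with posix_memalign() the buf/ptr distinction could probably
be dropped entirely):

static int init_each_dio_ctx(struct dio_ctx *ct, int blksize,
			     unsigned long pgsize)
{
	void *buf;

	ct->blksize = blksize;
	/* page-aligned buffer for the O_DIRECT reads, no manual rounding */
	if (posix_memalign(&buf, pgsize, blksize))
		return 1;
	ct->buf = buf;
	ct->ptr = ct->buf;	/* already aligned */
	ct->io_starttime.tv_sec = 0;
	ct->io_starttime.tv_nsec = 0;

	return 0;
}

The matching deinit_each_dio_ctx() would then call free() directly, since the
buffer no longer comes from MALLOC().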

>> +	ct->io_starttime.tv_sec = 0;
>> +	ct->io_starttime.tv_nsec = 0;
>> +
>> +	return 0;
>> +}
>> +
>> +static void deinit_each_dio_ctx(struct dio_ctx *ct)
>> +{
>> +	if (ct->buf)
>> +		FREE(ct->buf);
>> +}
>> +
>> +static int setup_directio_ctx(struct io_err_stat_path *p)
>> +{
>> +	unsigned long pgsize = getpagesize();
>> +	char fpath[FILE_NAME_SIZE+6];
>> +	int blksize;
>> +	int i;
>> +
>> +	snprintf(fpath, FILE_NAME_SIZE, "/dev/%s", p->devname);
> 
> My compiler spits out warnings on this one. Please use PATH_MAX for the
> length of "fpath" rather than FILE_NAME_SIZE.
> 

Great, I will alter this.
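
For the record, a sketch of the change (PATH_MAX comes from <limits.h>, which
may need to be added to the includes):

	char fpath[PATH_MAX];

	if (snprintf(fpath, sizeof(fpath), "/dev/%s", p->devname) >=
	    (int)sizeof(fpath))
		return 1;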

>> +	if (p->fd < 0)
>> +		p->fd = open(fpath, O_RDONLY | O_DIRECT);
>> +	if (p->fd < 0)
>> +		return 1;
>> +
>> +	p->dio_ctx_array = MALLOC(sizeof(struct dio_ctx *) *
>> CONCUR_NR_EVENT);
>> +	if (!p->dio_ctx_array)
>> +		goto fail_close;
>> +	for (i = 0; i < CONCUR_NR_EVENT; i++) {
>> +		p->dio_ctx_array[i] = MALLOC(sizeof(struct
>> dio_ctx));
>> +		if (!p->dio_ctx_array[i])
>> +			goto free_pdctx;
>> +	}
> 
> Why don't you simply make dio_ctx_array an array of dio_ctx (rather
> than dio_ctx*) and allocate CONCUR_N_EVENT*sizeof(dio_ctx)?
> 

An array of dio_ctx looks simpler and cleaner; I will change it, roughly as sketched below.
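
Roughly like this (a sketch, with the error paths trimmed; the "..." stands
for the BLKBSZGET handling that stays in between, as it is now):

	p->dio_ctx_array = MALLOC(sizeof(struct dio_ctx) * CONCUR_NR_EVENT);
	if (!p->dio_ctx_array)
		goto fail_close;
	...
	for (i = 0; i < CONCUR_NR_EVENT; i++) {
		if (init_each_dio_ctx(&p->dio_ctx_array[i], blksize, pgsize))
			goto deinit;
	}

The other loops then take &p->dio_ctx_array[i] instead of dereferencing a
pointer array, and a single FREE(p->dio_ctx_array) releases everything.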

>> +
>> +	if (ioctl(p->fd, BLKBSZGET, &blksize) < 0) {
>> +		io_err_stat_log(4, "%s:cannot get blocksize, set
>> default 512",
>> +				p->devname);
>> +		blksize = 512;
>> +	}
>> +	/*
>> +	 * Sanity check for block size
>> +	 */
>> +	if (blksize > 4096)
>> +		blksize = 4096;
> 
> I think this isn't necessary.
> 

Great, I will drop this.

>> +	if (!blksize)
>> +		goto free_pdctx;
>> +
>> +	for (i = 0; i < CONCUR_NR_EVENT; i++) {
>> +		if (init_each_dio_ctx(p->dio_ctx_array[i], blksize,
>> pgsize))
>> +			goto deinit;
>> +	}
>> +	return 0;
>> +
>> +deinit:
>> +	for (i = 0; i < CONCUR_NR_EVENT; i++)
>> +		deinit_each_dio_ctx(p->dio_ctx_array[i]);
>> +free_pdctx:
>> +	for (i = 0; i < CONCUR_NR_EVENT; i++) {
>> +		if (p->dio_ctx_array[i])
>> +			FREE(p->dio_ctx_array[i]);
>> +	}
>> +	FREE(p->dio_ctx_array);
>> +fail_close:
>> +	close(p->fd);
>> +
>> +	return 1;
>> +}
>> +
>> +static void destroy_directio_ctx(struct io_err_stat_path *p)
>> +{
>> +	int i;
>> +
>> +	if (!p || !p->dio_ctx_array)
>> +		return;
>> +	cancel_inflight_io(p);
>> +
>> +	for (i = 0; i < CONCUR_NR_EVENT; i++) {
>> +		deinit_each_dio_ctx(p->dio_ctx_array[i]);
>> +		FREE(p->dio_ctx_array[i]);
>> +	}
>> +
>> +	FREE(p->dio_ctx_array);
>> +	if (p->fd > 0)
>> +		close(p->fd);
>> +}
>> +
>> +static struct io_err_stat_path *alloc_io_err_stat_path(void)
>> +{
>> +	struct io_err_stat_path *p;
>> +
>> +	p = (struct io_err_stat_path *)MALLOC(sizeof(*p));
>> +	if (!p)
>> +		return NULL;
>> +
>> +	memset(p->devname, 0, sizeof(p->devname));
>> +	p->io_err_nr = 0;
>> +	p->io_nr = 0;
>> +	p->total_time = 0;
>> +	p->start_time.tv_sec = 0;
>> +	p->start_time.tv_nsec = 0;
>> +	p->err_rate_threshold = 0;
>> +	p->fd = -1;
>> +
>> +	return p;
>> +}
>> +
>> +static void free_io_err_stat_path(struct io_err_stat_path *p)
>> +{
>> +	FREE(p);
>> +}
>> +
>> +static struct io_err_stat_pathvec *alloc_pathvec(void)
>> +{
>> +	struct io_err_stat_pathvec *p;
>> +	int r;
>> +
>> +	p = (struct io_err_stat_pathvec *)MALLOC(sizeof(*p));
>> +	if (!p)
>> +		return NULL;
>> +	p->pathvec = vector_alloc();
>> +	if (!p->pathvec)
>> +		goto out_free_struct_pathvec;
>> +	r = pthread_mutex_init(&p->mutex, NULL);
>> +	if (r)
>> +		goto out_free_member_pathvec;
>> +
>> +	return p;
>> +
>> +out_free_member_pathvec:
>> +	vector_free(p->pathvec);
>> +out_free_struct_pathvec:
>> +	FREE(p);
>> +	return NULL;
>> +}
>> +
>> +static void free_io_err_pathvec(struct io_err_stat_pathvec *p)
>> +{
>> +	struct io_err_stat_path *path;
>> +	int i;
>> +
>> +	if (!p)
>> +		return;
>> +	pthread_mutex_destroy(&p->mutex);
>> +	if (!p->pathvec) {
>> +		vector_foreach_slot(p->pathvec, path, i) {
>> +			destroy_directio_ctx(path);
>> +			free_io_err_stat_path(path);
>> +		}
>> +		vector_free(p->pathvec);
>> +	}
>> +	FREE(p);
>> +}
>> +
>> +/*
>> + * return value
>> + * 0: enqueue OK
>> + * 1: fails because of internal error
>> + * 2: fails because of existing already
>> + */
>> +static int enqueue_io_err_stat_by_path(struct path *path)
>> +{
>> +	struct io_err_stat_path *p;
>> +
>> +	pthread_mutex_lock(&paths->mutex);
>> +	p = find_err_path_by_dev(paths->pathvec, path->dev);
>> +	if (p) {
>> +		pthread_mutex_unlock(&paths->mutex);
>> +		return 2;
>> +	}
>> +	pthread_mutex_unlock(&paths->mutex);
>> +
>> +	p = alloc_io_err_stat_path();
>> +	if (!p)
>> +		return 1;
>> +
>> +	memcpy(p->devname, path->dev, sizeof(p->devname));
>> +	p->total_time = path->mpp->path_io_err_sample_time;
>> +	p->err_rate_threshold = path->mpp-
>>> path_io_err_rate_threshold;
>> +
>> +	if (setup_directio_ctx(p))
>> +		goto free_ioerr_path;
>> +	pthread_mutex_lock(&paths->mutex);
>> +	if (!vector_alloc_slot(paths->pathvec))
>> +		goto unlock_destroy;
>> +	vector_set_slot(paths->pathvec, p);
>> +	pthread_mutex_unlock(&paths->mutex);
>> +
>> +	io_err_stat_log(2, "%s: enqueue path %s to check",
>> +			path->mpp->alias, path->dev);
> 
> So you chose not to fail the path in the kernel for the time of the
> test. As I noted previously, that might make the test more reliable. 
> Of course you shouldn't do it if this is the last remaining active
> path, but in that case I don't think you should even start testing.
> 
Great, I will adopt this.

I chose not to fail the path in the kernel because I wanted this
test to work in parallel with the san_path_err_XXX feature; the two
use separate flags to control the check_path() routine.

Now, the san_path_err_XXX feature will be reused for flakiness
pre-checking.

>> +	return 0;
>> +
>> +unlock_destroy:
>> +	pthread_mutex_unlock(&paths->mutex);
>> +	destroy_directio_ctx(p);
>> +free_ioerr_path:
>> +	free_io_err_stat_path(p);
>> +
>> +	return 1;
>> +}
>> +
>> +int io_err_stat_handle_pathfail(struct path *path)
>> +{
>> +	struct timespec curr_time;
>> +
>> +	if (path->io_err_disable_reinstate) {
>> +		io_err_stat_log(3, "%s: reinstate is already
>> disabled",
>> +				path->dev);
>> +		return 1;
>> +	}
>> +	if (path->io_err_pathfail_cnt == -1)
>> +		return 1;
>> +
>> +	if (!path->mpp)
>> +		return 1;
>> +	if (path->mpp->path_io_err_sample_time <= 0 ||
>> +		path->mpp->path_io_err_recovery_time <= 0 ||
>> +		path->mpp->path_io_err_rate_threshold < 0) {
>> +		io_err_stat_log(4, "%s: parameter not set", path-
>>> mpp->alias);
>> +		return 1;
>> +	}
>> +
>> +	/*
>> +	 * The test should only be started for paths that have
>> failed
>> +	 * repeatedly in a certain time frame, so that we have
>> reason
>> +	 * to assume they're flaky. Without bother the admin to
>> configure
>> +	 * the repeated count threshold and time frame, we assume a
>> path
>> +	 * which fails at least twice within 60 seconds is flaky.
>> +	 */
>> +	if (clock_gettime(CLOCK_MONOTONIC, &curr_time) != 0)
>> +		return 1;
>> +	if (path->io_err_pathfail_cnt == 0) {
>> +		path->io_err_pathfail_cnt++;
>> +		path->io_err_pathfail_starttime = curr_time.tv_sec;
>> +		io_err_stat_log(5, "%s: start path flakiness pre-
>> checking",
>> +				path->dev);
>> +		return 0;
>> +	}
>> +	if ((curr_time.tv_sec - path->io_err_pathfail_starttime) >
>> +			FLAKY_PATHFAIL_TIME_FRAME) {
>> +		path->io_err_pathfail_cnt = 0;
>> +		path->io_err_pathfail_starttime = curr_time.tv_sec;
>> +		io_err_stat_log(5, "%s: restart path flakiness pre-
>> checking",
>> +				path->dev);
>> +	}
>> +	path->io_err_pathfail_cnt++;
>> +	if (path->io_err_pathfail_cnt >= FLAKY_PATHFAIL_THRESHOLD) {
>> +		path->io_err_pathfail_cnt = -1;
>> +		return enqueue_io_err_stat_by_path(path);
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>> +int hit_io_err_recovery_time(struct path *pp)
>> +{
>> +	struct timespec curr_time;
>> +	int r;
>> +
>> +	if (pp->io_err_disable_reinstate == 0)
>> +		return 1;
>> +	if (clock_gettime(CLOCK_MONOTONIC, &curr_time) != 0)
>> +		return 1;
>> +	if (pp->mpp->nr_active <= 0) {
>> +		io_err_stat_log(2, "%s: recover path early", pp-
>>> dev);
>> +		goto recover;
>> +	}
>> +	if ((curr_time.tv_sec - pp->io_err_dis_reinstate_time) >
>> +			pp->mpp->path_io_err_recovery_time) {
>> +		io_err_stat_log(4, "%s: reschedule checking after %d
>> seconds",
>> +				pp->dev, pp->mpp-
>>> path_io_err_sample_time);
>> +		/*
>> +		 * to reschedule io error checking again
>> +		 * if the path is good enough, we claim it is good
>> +		 * and can be reinsated as soon as possible in the
>> +		 * check_path routine.
>> +		 */
>> +		pp->io_err_pathfail_cnt = -1;
>> +		pp->io_err_dis_reinstate_time = curr_time.tv_sec;
>> +		r = enqueue_io_err_stat_by_path(pp);
>> +		/*
>> +		 * Enqueue fails because of internal error.
>> +		 * In this case , we recover this path
>> +		 * Or else,  return hi to set path state to
>> PATH_DELAYED
>> +		 */
>> +		if (r == 1) {
>> +			io_err_stat_log(3, "%s: enqueue fails, to
>> recover",
>> +					pp->dev);
>> +			goto recover;
>> +		}
>> +	}
>> +
>> +	return 1;
>> +
>> +recover:
>> +	pp->io_err_pathfail_cnt = 0;
>> +	pp->io_err_disable_reinstate = 0;
>> +	pp->tick = 1;
>> +	return 0;
>> +}
>> +
>> +static int delete_io_err_stat_by_addr(struct io_err_stat_path *p)
>> +{
>> +	int i;
>> +
>> +	i = find_slot(paths->pathvec, p);
>> +	if (i != -1)
>> +		vector_del_slot(paths->pathvec, i);
>> +
>> +	destroy_directio_ctx(p);
>> +	free_io_err_stat_path(p);
>> +
>> +	return 0;
>> +}
>> +
>> +static void account_async_io_state(struct io_err_stat_path *pp, int
>> rc)
>> +{
>> +	switch (rc) {
>> +	case PATH_DOWN:
>> +	case PATH_TIMEOUT:
>> +		pp->io_err_nr++;
>> +		break;
>> +	case PATH_UNCHECKED:
>> +	case PATH_UP:
>> +	case PATH_PENDING:
>> +		break;
>> +	default:
>> +		break;
>> +	}
>> +}
>> +
>> +static int poll_io_err_stat(struct vectors *vecs, struct
>> io_err_stat_path *pp)
>> +{
>> +	struct timespec currtime, difftime;
>> +	struct path *path;
>> +	double err_rate;
>> +
>> +	if (clock_gettime(CLOCK_MONOTONIC, &currtime) != 0)
>> +		return 1;
>> +	timespecsub(&currtime, &pp->start_time, &difftime);
>> +	if (difftime.tv_sec < pp->total_time)
>> +		return 0;
>> +
>> +	io_err_stat_log(4, "check end for %s", pp->devname);
>> +
>> +	err_rate = pp->io_nr == 0 ? 0 : (pp->io_err_nr * 1000.0f) /
>> pp->io_nr;
>> +	io_err_stat_log(5, "%s: IO error rate (%.1f/1000)",
>> +			pp->devname, err_rate);
>> +	pthread_cleanup_push(cleanup_lock, &vecs->lock);
>> +	lock(&vecs->lock);
>> +	pthread_testcancel();
>> +	path = find_path_by_dev(vecs->pathvec, pp->devname);
>> +	if (!path) {
>> +		io_err_stat_log(4, "path %s not found'", pp-
>>> devname);
>> +	} else if (err_rate <= pp->err_rate_threshold) {
>> +		path->io_err_pathfail_cnt = 0;
>> +		path->io_err_disable_reinstate = 0;
>> +		io_err_stat_log(4, "%s: (%d/%d) good to enable
>> reinstating",
>> +				pp->devname, pp->io_err_nr, pp-
>>> io_nr);
>> +	} else if (path->mpp && path->mpp->nr_active > 1) {
>> +		dm_fail_path(path->mpp->alias, path->dev_t);
>> +		update_queue_mode_del_path(path->mpp);
>> +		io_err_stat_log(2, "%s: proactively fail dm path
>> %s",
>> +				path->mpp->alias, path->dev);
>> +		path->io_err_disable_reinstate = 1;
>> +		path->io_err_dis_reinstate_time = currtime.tv_sec;
>> +		io_err_stat_log(3, "%s: to disable %s to reinstate",
>> +				path->mpp->alias, path->dev);
>> +
>> +		/*
>> +		 * schedule path check as soon as possible to
>> +		 * update path state to delayed state
>> +		 */
>> +		path->tick = 1;
>> +	} else {
>> +		path->io_err_pathfail_cnt = 0;
>> +		path->io_err_disable_reinstate = 0;
>> +		io_err_stat_log(4, "%s: there is orphan path, enable
>> reinstating",
>> +				pp->devname);
>> +	}
>> +	lock_cleanup_pop(vecs->lock);
>> +
>> +	delete_io_err_stat_by_addr(pp);
>> +
>> +	return 0;
>> +}
>> +
>> +static int send_each_async_io(struct dio_ctx *ct, int fd, char *dev)
>> +{
>> +	int rc = -1;
>> +
>> +	if (ct->io_starttime.tv_nsec == 0 &&
>> +			ct->io_starttime.tv_sec == 0) {
>> +		struct iocb *ios[1] = { &ct->io };
>> +
>> +		if (clock_gettime(CLOCK_MONOTONIC, &ct-
>>> io_starttime) != 0) {
>> +			ct->io_starttime.tv_sec = 0;
>> +			ct->io_starttime.tv_nsec = 0;
>> +			return rc;
>> +		}
>> +		io_prep_pread(&ct->io, fd, ct->ptr, ct->blksize, 0);
>> +		if (io_submit(ioctx, 1, ios) != 1) {
>> +			io_err_stat_log(5, "%s: io_submit error %i",
>> +					dev, errno);
>> +			return rc;
>> +		}
>> +	}
>> +	rc = 0;
>> +
>> +	return rc;
>> +}
> 
> You return 0 here both if you submit IO and if you don't even try
> because io_starttime is already set ...
> 

Oh oh, this is a real bug. Thanks. I will fix this.

>> +
>> +static void send_batch_async_ios(struct io_err_stat_path *pp)
>> +{
>> +	int i;
>> +	struct dio_ctx *ct;
>> +
>> +	for (i = 0; i < CONCUR_NR_EVENT; i++) {
>> +		ct = pp->dio_ctx_array[i];
>> +		if (!send_each_async_io(ct, pp->fd, pp->devname))
>> +			pp->io_nr++;
> 
> ... and here you increase io_nr for rc == 0. Is that correct?
> 

I will make sure send_each_async_io() returns 0 if and only if
io_submit() succeeds, roughly as in the sketch below.
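
Something like this (a sketch on top of the current function; 0 is returned
only when io_submit() actually succeeded, so the caller can count it as one
sample IO):

static int send_each_async_io(struct dio_ctx *ct, int fd, char *dev)
{
	struct iocb *ios[1] = { &ct->io };

	if (ct->io_starttime.tv_sec != 0 || ct->io_starttime.tv_nsec != 0)
		return -1;	/* previous IO on this context still in flight */

	if (clock_gettime(CLOCK_MONOTONIC, &ct->io_starttime) != 0) {
		ct->io_starttime.tv_sec = 0;
		ct->io_starttime.tv_nsec = 0;
		return -1;
	}
	io_prep_pread(&ct->io, fd, ct->ptr, ct->blksize, 0);
	if (io_submit(ioctx, 1, ios) != 1) {
		ct->io_starttime.tv_sec = 0;
		ct->io_starttime.tv_nsec = 0;
		io_err_stat_log(5, "%s: io_submit error %i", dev, errno);
		return -1;
	}

	return 0;
}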


>> +	}
>> +	if (pp->start_time.tv_sec == 0 && pp->start_time.tv_nsec ==
>> 0 &&
>> +		clock_gettime(CLOCK_MONOTONIC, &pp->start_time)) {
>> +		pp->start_time.tv_sec = 0;
>> +		pp->start_time.tv_nsec = 0;
>> +	}
>> +}
>> +
>> +static int try_to_cancel_timeout_io(struct dio_ctx *ct, struct
>> timespec *t,
>> +		char *dev)
>> +{
>> +	struct timespec	difftime;
>> +	struct io_event	event;
>> +	int		rc = PATH_UNCHECKED;
>> +	int		r;
>> +
>> +	timespecsub(t, &ct->io_starttime, &difftime);
>> +	if (difftime.tv_sec > IOTIMEOUT_SEC) {
>> +		struct iocb *ios[1] = { &ct->io };
>> +
>> +		io_err_stat_log(5, "%s: abort check on timeout",
>> dev);
>> +		r = io_cancel(ioctx, ios[0], &event);
>> +		if (r) {
>> +			io_err_stat_log(5, "%s: io_cancel error %i",
>> +					dev, errno);
>> +		} else {
>> +			ct->io_starttime.tv_sec = 0;
>> +			ct->io_starttime.tv_nsec = 0;
>> +		}
>> +		rc = PATH_TIMEOUT;
>> +	} else {
>> +		rc = PATH_PENDING;
>> +	}
>> +
>> +	return rc;
>> +}
>> +
>> +static void handle_async_io_timeout(void)
>> +{
>> +	struct io_err_stat_path *pp;
>> +	struct timespec curr_time;
>> +	int		rc = PATH_UNCHECKED;
>> +	int		i, j;
>> +
>> +	if (clock_gettime(CLOCK_MONOTONIC, &curr_time) != 0)
>> +		return;
>> +	vector_foreach_slot(paths->pathvec, pp, i) {
>> +		for (j = 0; j < CONCUR_NR_EVENT; j++) {
>> +			rc = try_to_cancel_timeout_io(pp-
>>> dio_ctx_array[j],
>> +					&curr_time, pp->devname);
>> +			account_async_io_state(pp, rc);
>> +		}
>> +	}
>> +}
>> +
>> +static void cancel_inflight_io(struct io_err_stat_path *pp)
>> +{
>> +	struct io_event event;
>> +	int i, r;
>> +
>> +	for (i = 0; i < CONCUR_NR_EVENT; i++) {
>> +		struct dio_ctx *ct = pp->dio_ctx_array[i];
>> +		struct iocb *ios[1] = { &ct->io };
>> +
>> +		if (ct->io_starttime.tv_sec == 0
>> +				&& ct->io_starttime.tv_nsec == 0)
>> +			continue;
>> +		io_err_stat_log(5, "%s: abort infligh io",
>> +				pp->devname);
>> +		r = io_cancel(ioctx, ios[0], &event);
>> +		if (r) {
>> +			io_err_stat_log(5, "%s: io_cancel error %d,
>> %i",
>> +					pp->devname, r, errno);
>> +		} else {
>> +			ct->io_starttime.tv_sec = 0;
>> +			ct->io_starttime.tv_nsec = 0;
>> +		}
>> +	}
>> +}
>> +
>> +static inline int handle_done_dio_ctx(struct dio_ctx *ct, struct
>> io_event *ev)
>> +{
>> +	ct->io_starttime.tv_sec = 0;
>> +	ct->io_starttime.tv_nsec = 0;
>> +	return (ev->res == ct->blksize) ? PATH_UP : PATH_DOWN;
>> +}
>> +
>> +static void handle_async_io_done_event(struct io_event *io_evt)
>> +{
>> +	struct io_err_stat_path *pp;
>> +	struct dio_ctx *ct;
>> +	int rc = PATH_UNCHECKED;
>> +	int i, j;
>> +
>> +	vector_foreach_slot(paths->pathvec, pp, i) {
>> +		for (j = 0; j < CONCUR_NR_EVENT; j++) {
>> +			ct = pp->dio_ctx_array[j];
>> +			if (&ct->io == io_evt->obj) {
>> +				rc = handle_done_dio_ctx(ct,
>> io_evt);
>> +				account_async_io_state(pp, rc);
>> +				return;
>> +			}
>> +		}
>> +	}
>> +}
>> +
>> +static void process_async_ios_event(int timeout_secs, char *dev)
>> +{
>> +	struct io_event events[CONCUR_NR_EVENT];
>> +	int		i, n;
>> +	struct timespec	timeout = { .tv_sec = timeout_secs };
>> +
>> +	errno = 0;
>> +	n = io_getevents(ioctx, 1L, CONCUR_NR_EVENT, events,
>> &timeout);
>> +	if (n < 0) {
>> +		io_err_stat_log(3, "%s: async io events returned %d
>> (errno=%s)",
>> +				dev, n, strerror(errno));
>> +	} else if (n == 0L) {
>> +		handle_async_io_timeout();
>> +	} else {
>> +		for (i = 0; i < n; i++)
>> +			handle_async_io_done_event(&events[i]);
>> +	}
>> +}
> 
> I'm not sure this is correct. You handle timeouts only if io_getevents
> returns 0. But it because not all IOs have necessarily the same start
> time, it could be that some have timed out while others haven't. Or am
> I overlooking something? At least this deserves some comments.
> 

The function name handle_async_io_timeout() is ambiguous; I will rename it.
In fact, it only cancels IOs that have timed out, as shown in try_to_cancel_timeout_io():

+static int try_to_cancel_timeout_io(struct dio_ctx *ct, struct timespec *t,
+		char *dev)
+{
+	struct timespec	difftime;
+	struct io_event	event;
+	int		rc = PATH_UNCHECKED;
+	int		r;
+
+	timespecsub(t, &ct->io_starttime, &difftime);
+	if (difftime.tv_sec > IOTIMEOUT_SEC) {
+		struct iocb *ios[1] = { &ct->io };
+
+		io_err_stat_log(5, "%s: abort check on timeout", dev);
+		r = io_cancel(ioctx, ios[0], &event);


>> +
>> +static void service_paths(void)
>> +{
>> +	struct io_err_stat_path *pp;
>> +	int i;
>> +
>> +	pthread_mutex_lock(&paths->mutex);
>> +	vector_foreach_slot(paths->pathvec, pp, i) {
>> +		send_batch_async_ios(pp);
>> +		process_async_ios_event(IOTIMEOUT_SEC, pp->devname);
>> +		poll_io_err_stat(vecs, pp);
>> +	}
>> +	pthread_mutex_unlock(&paths->mutex);
>> +}
> 
> There's another problem here. You wait up to IOTIMEOUT_SEC seconds for
> each path. This will loop take a minute for every path that is down.
> You'll end up waiting much too long for the late paths in the loop. You
> should rather set an end time and wait only the difference END-NOW()
> So,
Do you mean changing process_async_ios_event(IOTIMEOUT_SEC, pp->devname) to
something like process_async_ios_event(10ms, pp->devname)?

If io_getevents() returns no completion events within the 10 ms window of the
current round, it is simply called again in the next 100 ms round of the
service loop, roughly as sketched below.
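
I.e. something like this (a sketch; whether 10 ms is the right value, or
whether an explicit end-time calculation as you suggest is better, is open
for discussion):

static void process_async_ios_event(long timeout_nsecs, char *dev)
{
	struct io_event events[CONCUR_NR_EVENT];
	int		i, n;
	struct timespec	timeout = { .tv_sec = 0, .tv_nsec = timeout_nsecs };

	errno = 0;
	/* e.g. timeout_nsecs = 10 * 1000 * 1000 for a 10 ms wait per round */
	n = io_getevents(ioctx, 1L, CONCUR_NR_EVENT, events, &timeout);
	if (n < 0) {
		io_err_stat_log(3, "%s: async io events returned %d (errno=%s)",
				dev, n, strerror(errno));
	} else if (n == 0) {
		/* nothing completed: cancel only IOs older than IOTIMEOUT_SEC */
		handle_async_io_timeout();
	} else {
		for (i = 0; i < n; i++)
			handle_async_io_done_event(&events[i]);
	}
}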

>>
>> +static void *io_err_stat_loop(void *data)
>> +{
>> +	vecs = (struct vectors *)data;
>> +	pthread_cleanup_push(rcu_unregister, NULL);
>> +	rcu_register_thread();
>> +
>> +	mlockall(MCL_CURRENT | MCL_FUTURE);
>> +	while (1) {
>> +		service_paths();
>> +		usleep(100000);
>> +	}
>> +
>> +	pthread_cleanup_pop(1);
>> +	return NULL;
>> +}
>> +
>> +int start_io_err_stat_thread(void *data)
>> +{
>> +	if (io_setup(CONCUR_NR_EVENT, &ioctx) != 0) {
>> +		io_err_stat_log(4, "io_setup failed");
>> +		return 1;
>> +	}
>> +	paths = alloc_pathvec();
>> +	if (!paths)
>> +		goto destroy_ctx;
>> +
>> +	if (pthread_create(&io_err_stat_thr, &io_err_stat_attr,
>> +				io_err_stat_loop, data)) {
>> +		io_err_stat_log(0, "cannot create io_error statistic
>> thread");
>> +		goto out_free;
>> +	}
>> +	io_err_stat_log(3, "thread started");
>> +	return 0;
>> +
>> +out_free:
>> +	free_io_err_pathvec(paths);
>> +destroy_ctx:
>> +	io_destroy(ioctx);
>> +	io_err_stat_log(0, "failed to start io_error statistic
>> thread");
>> +	return 1;
>> +}
>> +
>> +void stop_io_err_stat_thread(void)
>> +{
>> +	pthread_cancel(io_err_stat_thr);
>> +	pthread_kill(io_err_stat_thr, SIGUSR2);
>> +	free_io_err_pathvec(paths);
>> +	io_destroy(ioctx);
>> +}
>> diff --git a/libmultipath/io_err_stat.h b/libmultipath/io_err_stat.h
>> new file mode 100644
>> index 00000000..97685617
>> --- /dev/null
>> +++ b/libmultipath/io_err_stat.h
>> @@ -0,0 +1,15 @@
>> +#ifndef _IO_ERR_STAT_H
>> +#define _IO_ERR_STAT_H
>> +
>> +#include "vector.h"
>> +#include "lock.h"
>> +
>> +
>> +extern pthread_attr_t io_err_stat_attr;
>> +
>> +int start_io_err_stat_thread(void *data);
>> +void stop_io_err_stat_thread(void);
>> +int io_err_stat_handle_pathfail(struct path *path);
>> +int hit_io_err_recovery_time(struct path *pp);
>> +
>> +#endif /* _IO_ERR_STAT_H */
>> diff --git a/libmultipath/propsel.c b/libmultipath/propsel.c
>> index 175fbe11..9d2c3c09 100644
>> --- a/libmultipath/propsel.c
>> +++ b/libmultipath/propsel.c
>> @@ -731,6 +731,7 @@ out:
>>  	return 0;
>>  
>>  }
>> +
>>  int select_san_path_err_threshold(struct config *conf, struct
>> multipath *mp)
>>  {
>>  	char *origin, buff[12];
>> @@ -761,6 +762,7 @@ out:
>>  	return 0;
>>  
>>  }
>> +
>>  int select_san_path_err_recovery_time(struct config *conf, struct
>> multipath *mp)
>>  {
>>  	char *origin, buff[12];
>> @@ -776,6 +778,57 @@ out:
>>  	return 0;
>>  
>>  }
>> +
>> +int select_path_io_err_sample_time(struct config *conf, struct
>> multipath *mp)
>> +{
>> +	char *origin, buff[12];
>> +
>> +	mp_set_mpe(path_io_err_sample_time);
>> +	mp_set_ovr(path_io_err_sample_time);
>> +	mp_set_hwe(path_io_err_sample_time);
>> +	mp_set_conf(path_io_err_sample_time);
>> +	mp_set_default(path_io_err_sample_time, DEFAULT_ERR_CHECKS);
>> +out:
>> +	print_off_int_undef(buff, 12, &mp->path_io_err_sample_time);
>> +	condlog(3, "%s: path_io_err_sample_time = %s %s", mp->alias, 
>> buff,
>> +			origin);
>> +	return 0;
>> +}
>> +
>> +int select_path_io_err_rate_threshold(struct config *conf, struct
>> multipath *mp)
>> +{
>> +	char *origin, buff[12];
>> +
>> +	mp_set_mpe(path_io_err_rate_threshold);
>> +	mp_set_ovr(path_io_err_rate_threshold);
>> +	mp_set_hwe(path_io_err_rate_threshold);
>> +	mp_set_conf(path_io_err_rate_threshold);
>> +	mp_set_default(path_io_err_rate_threshold,
>> DEFAULT_ERR_CHECKS);
>> +out:
>> +	print_off_int_undef(buff, 12, &mp-
>>> path_io_err_rate_threshold);
>> +	condlog(3, "%s: path_io_err_rate_threshold = %s %s", mp-
>>> alias, buff,
>> +			origin);
>> +	return 0;
>> +
>> +}
>> +
>> +int select_path_io_err_recovery_time(struct config *conf, struct
>> multipath *mp)
>> +{
>> +	char *origin, buff[12];
>> +
>> +	mp_set_mpe(path_io_err_recovery_time);
>> +	mp_set_ovr(path_io_err_recovery_time);
>> +	mp_set_hwe(path_io_err_recovery_time);
>> +	mp_set_conf(path_io_err_recovery_time);
>> +	mp_set_default(path_io_err_recovery_time,
>> DEFAULT_ERR_CHECKS);
>> +out:
>> +	print_off_int_undef(buff, 12, &mp-
>>> path_io_err_recovery_time);
>> +	condlog(3, "%s: path_io_err_recovery_time = %s %s", mp-
>>> alias, buff,
>> +			origin);
>> +	return 0;
>> +
>> +}
>> +
>>  int select_skip_kpartx (struct config *conf, struct multipath * mp)
>>  {
>>  	char *origin;
>> diff --git a/libmultipath/propsel.h b/libmultipath/propsel.h
>> index f8e96d85..1b2b5714 100644
>> --- a/libmultipath/propsel.h
>> +++ b/libmultipath/propsel.h
>> @@ -28,6 +28,9 @@ int select_max_sectors_kb (struct config *conf,
>> struct multipath * mp);
>>  int select_san_path_err_forget_rate(struct config *conf, struct
>> multipath *mp);
>>  int select_san_path_err_threshold(struct config *conf, struct
>> multipath *mp);
>>  int select_san_path_err_recovery_time(struct config *conf, struct
>> multipath *mp);
>> +int select_path_io_err_sample_time(struct config *conf, struct
>> multipath *mp);
>> +int select_path_io_err_rate_threshold(struct config *conf, struct
>> multipath *mp);
>> +int select_path_io_err_recovery_time(struct config *conf, struct
>> multipath *mp);
>>  void reconcile_features_with_options(const char *id, char
>> **features,
>>  				     int* no_path_retry,
>>  				     int *retain_hwhandler);
>> diff --git a/libmultipath/structs.h b/libmultipath/structs.h
>> index 8ea984d9..1ab8cb9b 100644
>> --- a/libmultipath/structs.h
>> +++ b/libmultipath/structs.h
>> @@ -235,6 +235,10 @@ struct path {
>>  	time_t dis_reinstate_time;
>>  	int disable_reinstate;
>>  	int san_path_err_forget_rate;
>> +	time_t io_err_dis_reinstate_time;
>> +	int io_err_disable_reinstate;
>> +	int io_err_pathfail_cnt;
>> +	int io_err_pathfail_starttime;
>>  	/* configlet pointers */
>>  	struct hwentry * hwe;
>>  };
>> @@ -269,6 +273,9 @@ struct multipath {
>>  	int san_path_err_threshold;
>>  	int san_path_err_forget_rate;
>>  	int san_path_err_recovery_time;
>> +	int path_io_err_sample_time;
>> +	int path_io_err_rate_threshold;
>> +	int path_io_err_recovery_time;
>>  	int skip_kpartx;
>>  	int max_sectors_kb;
>>  	int force_readonly;
>> diff --git a/libmultipath/uevent.c b/libmultipath/uevent.c
>> index eb44da56..56de1320 100644
>> --- a/libmultipath/uevent.c
>> +++ b/libmultipath/uevent.c
>> @@ -904,8 +904,8 @@ char *uevent_get_dm_name(struct uevent *uev)
>>  	int i;
>>  
>>  	for (i = 0; uev->envp[i] != NULL; i++) {
>> -		if (!strncmp(uev->envp[i], "DM_NAME", 6) &&
>> -		    strlen(uev->envp[i]) > 7) {
>> +		if (!strncmp(uev->envp[i], "DM_NAME", 7) &&
>> +		    strlen(uev->envp[i]) > 8) {
>>  			p = MALLOC(strlen(uev->envp[i] + 8) + 1);
>>  			strcpy(p, uev->envp[i] + 8);
>>  			break;
> 
> Thanks for spotting. Please post this as a separate, bugfix patch.
> 

OK.

> 
>> @@ -913,3 +913,35 @@ char *uevent_get_dm_name(struct uevent *uev)
>>  	}
>>  	return p;
>>  }
>> +
>> +char *uevent_get_dm_path(struct uevent *uev)
>> +{
>> +	char *p = NULL;
>> +	int i;
>> +
>> +	for (i = 0; uev->envp[i] != NULL; i++) {
>> +		if (!strncmp(uev->envp[i], "DM_PATH", 7) &&
>> +		    strlen(uev->envp[i]) > 8) {
>> +			p = MALLOC(strlen(uev->envp[i] + 8) + 1);
>> +			strcpy(p, uev->envp[i] + 8);
>> +			break;
>> +		}
>> +	}
>> +	return p;
>> +}
>> +
>> +char *uevent_get_dm_action(struct uevent *uev)
>> +{
>> +	char *p = NULL;
>> +	int i;
>> +
>> +	for (i = 0; uev->envp[i] != NULL; i++) {
>> +		if (!strncmp(uev->envp[i], "DM_ACTION", 9) &&
>> +		    strlen(uev->envp[i]) > 10) {
>> +			p = MALLOC(strlen(uev->envp[i] + 10) + 1);
>> +			strcpy(p, uev->envp[i] + 10);
>> +			break;
>> +		}
>> +	}
>> +	return p;
>> +}
>> diff --git a/libmultipath/uevent.h b/libmultipath/uevent.h
>> index 61a42071..6f5af0af 100644
>> --- a/libmultipath/uevent.h
>> +++ b/libmultipath/uevent.h
>> @@ -37,5 +37,7 @@ int uevent_get_major(struct uevent *uev);
>>  int uevent_get_minor(struct uevent *uev);
>>  int uevent_get_disk_ro(struct uevent *uev);
>>  char *uevent_get_dm_name(struct uevent *uev);
>> +char *uevent_get_dm_path(struct uevent *uev);
>> +char *uevent_get_dm_action(struct uevent *uev);
>>  
>>  #endif /* _UEVENT_H */
>> diff --git a/multipath/multipath.conf.5 b/multipath/multipath.conf.5
>> index d9ac279f..c4406128 100644
>> --- a/multipath/multipath.conf.5
>> +++ b/multipath/multipath.conf.5
>> @@ -849,6 +849,49 @@ The default is: \fBno\fR
>>  .
>>  .
>>  .TP
>> +.B path_io_err_sample_time
>> +One of the three parameters of supporting path check based on
>> accounting IO
>> +error such as intermittent error. If it is set to a value greater
>> than 0,
>> +when a path fail event occurs due to an IO error, multipathd will
>> enqueue this
>> +path into a queue of which members are sent a couple of continuous
>> direct
>> +reading aio at a fixed sample rate of 10HZ. The IO accounting
>> process for a
>> +path will last for \fIpath_io_err_sample_time\fR. If the rate of IO
>> error on
>> +a particular path is greater than the
>> \fIpath_io_err_rate_threshold\fR, then
>> +the path will not reinstate for \fIpath_io_err_rate_threshold\fR
>> seconds unless
>> +there is only one active path.
>> +.RS
>> +.TP
>> +The default is: \fBno\fR
>> +.RE
>> +.
>> +.
>> +.TP
>> +.B path_io_err_rate_threshold
>> +The error rate threshold as a permillage (1/1000). One of the three
>> parameters
>> +of supporting path check based on accounting IO error such as
>> intermittent
>> +error. Refer to \fIpath_io_err_sample_time\fR. If the rate of IO
>> errors on a
>> +particular path is greater than this parameter, then the path will
>> not
>> +reinstate for \fIpath_io_err_rate_threshold\fR seconds unless there
>> is only one
>> +active path.
>> +.RS
>> +.TP
>> +The default is: \fBno\fR
>> +.RE
>> +.
>> +.
>> +.TP
>> +.B path_io_err_recovery_time
>> +One of the three parameters of supporting path check based on
>> accounting IO
>> +error such as intermittent error. Refer to
>> \fIpath_io_err_sample_time\fR. If
>> +this parameter is set to a positive value, the path which has many
>> error will
>> +not reinsate till \fIpath_io_err_recovery_time\fR seconds.
>> +.RS
>> +.TP
>> +The default is: \fBno\fR
>> +.RE
>> +.
>> +.
>> +.TP
>>  .B delay_watch_checks
>>  If set to a value greater than 0, multipathd will watch paths that
>> have
>>  recently become valid for this many checks. If they fail again while
>> they are
>> diff --git a/multipathd/main.c b/multipathd/main.c
>> index 4be2c579..5758cdd1 100644
>> --- a/multipathd/main.c
>> +++ b/multipathd/main.c
>> @@ -84,6 +84,7 @@ int uxsock_timeout;
>>  #include "cli_handlers.h"
>>  #include "lock.h"
>>  #include "waiter.h"
>> +#include "io_err_stat.h"
>>  #include "wwids.h"
>>  #include "../third-party/valgrind/drd.h"
>>  
>> @@ -1050,6 +1051,41 @@ out:
>>  }
>>  
>>  static int
>> +uev_pathfail_check(struct uevent *uev, struct vectors *vecs)
>> +{
>> +	char *action = NULL, *devt = NULL;
>> +	struct path *pp;
>> +	int r;
>> +
>> +	action = uevent_get_dm_action(uev);
>> +	if (!action)
>> +		return 1;
>> +	if (strncmp(action, "PATH_FAILED", 11))
>> +		goto out;
>> +	devt = uevent_get_dm_path(uev);
>> +	if (!devt) {
>> +		condlog(3, "%s: No DM_PATH in uevent", uev->kernel);
>> +		goto out;
>> +	}
>> +
>> +	pthread_cleanup_push(cleanup_lock, &vecs->lock);
>> +	lock(&vecs->lock);
>> +	pthread_testcancel();
>> +	pp = find_path_by_devt(vecs->pathvec, devt);
>> +	r = io_err_stat_handle_pathfail(pp);
>> +	lock_cleanup_pop(vecs->lock);
>> +
>> +	if (r)
>> +		condlog(3, "io_err_stat: fails to enqueue %s", pp-
>>> dev);
>> +	FREE(devt);
>> +	FREE(action);
>> +	return 0;
>> +out:
>> +	FREE(action);
>> +	return 1;
>> +}
>> +
>> +static int
>>  map_discovery (struct vectors * vecs)
>>  {
>>  	struct multipath * mpp;
>> @@ -1134,6 +1170,7 @@ uev_trigger (struct uevent * uev, void *
>> trigger_data)
>>  	if (!strncmp(uev->kernel, "dm-", 3)) {
>>  		if (!strncmp(uev->action, "change", 6)) {
>>  			r = uev_add_map(uev, vecs);
>> +			uev_pathfail_check(uev, vecs);
>>  			goto out;
>>  		}
>>  		if (!strncmp(uev->action, "remove", 6)) {
> 
> It's interesting that you have chosen to act on kernel PATH_FAILED
> events here. The existing multipath-tools code doesn't do that, it
> relies on its internal state information (pathinfo, checkers) instead.
> I am not saying that what you're doing is wrong, but it deserves some
> explanation. 
> 
> Regards
> Martin
> 

The kernel PATH_FAILED events are only issued when an IO error actually
occurs on a path. I chose this as the entry point for the intermittent IO
error accounting to avoid being triggered by path removal and addition
events. For those events (uev_add_path, uev_remove_path), sending a burst of
continuous test IO would only produce a stream of IO errors.

Comments will be added in the next patch.
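
A first draft of the comment I have in mind for uev_pathfail_check() (wording
to be refined):

	/*
	 * Act on kernel PATH_FAILED dm uevents rather than on the internal
	 * checker state: such an event is only generated when real IO has
	 * failed on a path. This keeps the accounting test from being
	 * triggered by ordinary path add/remove events, where sending test
	 * IO would only produce a stream of IO errors.
	 */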


Best wishes

Guan Junxiong

>> @@ -1553,6 +1590,7 @@ static int check_path_reinstate_state(struct
>> path * pp) {
>>  		condlog(2, "%s : hit error threshold. Delaying path
>> reinstatement", pp->dev);
>>  		pp->dis_reinstate_time = curr_time.tv_sec;
>>  		pp->disable_reinstate = 1;
>> +
>>  		return 1;
>>  	} else {
>>  		return 0;
>> @@ -1684,6 +1722,16 @@ check_path (struct vectors * vecs, struct path
>> * pp, int ticks)
>>  		return 1;
>>  	}
>>  
>> +	if (pp->io_err_disable_reinstate &&
>> hit_io_err_recovery_time(pp)) {
>> +		pp->state = PATH_DELAYED;
>> +		/*
>> +		 * to reschedule as soon as possible,so that this
>> path can
>> +		 * be recoverd in time
>> +		 */
>> +		pp->tick = 1;
>> +		return 1;
>> +	}
>> +
>>  	if ((newstate == PATH_UP || newstate == PATH_GHOST) &&
>>  	     pp->wait_checks > 0) {
>>  		if (pp->mpp->nr_active > 0) {
>> @@ -2377,6 +2425,7 @@ child (void * param)
>>  	setup_thread_attr(&misc_attr, 64 * 1024, 0);
>>  	setup_thread_attr(&uevent_attr, DEFAULT_UEVENT_STACKSIZE *
>> 1024, 0);
>>  	setup_thread_attr(&waiter_attr, 32 * 1024, 1);
>> +	setup_thread_attr(&io_err_stat_attr, 32 * 1024, 1);
>>  
>>  	if (logsink == 1) {
>>  		setup_thread_attr(&log_attr, 64 * 1024, 0);
>> @@ -2499,6 +2548,10 @@ child (void * param)
>>  	/*
>>  	 * start threads
>>  	 */
>> +	rc = start_io_err_stat_thread(vecs);
>> +	if (rc)
>> +		goto failed;
>> +
>>  	if ((rc = pthread_create(&check_thr, &misc_attr,
>> checkerloop, vecs))) {
>>  		condlog(0,"failed to create checker loop thread:
>> %d", rc);
>>  		goto failed;
>> @@ -2548,6 +2601,8 @@ child (void * param)
>>  	remove_maps_and_stop_waiters(vecs);
>>  	unlock(&vecs->lock);
>>  
>> +	stop_io_err_stat_thread();
>> +
>>  	pthread_cancel(check_thr);
>>  	pthread_cancel(uevent_thr);
>>  	pthread_cancel(uxlsnr_thr);
>> @@ -2593,6 +2648,7 @@ child (void * param)
>>  	udev_unref(udev);
>>  	udev = NULL;
>>  	pthread_attr_destroy(&waiter_attr);
>> +	pthread_attr_destroy(&io_err_stat_attr);
>>  #ifdef _DEBUG_
>>  	dbg_free_final(NULL);
>>  #endif
> 



