diff --git a/libmultipath/config.c b/libmultipath/config.c index 15ddbd8..19adb97 100644 --- a/libmultipath/config.c +++ b/libmultipath/config.c @@ -348,6 +348,9 @@ merge_hwe (struct hwentry * dst, struct hwentry * src) merge_num(delay_wait_checks); merge_num(skip_kpartx); merge_num(max_sectors_kb); + merge_num(san_path_err_threshold); + merge_num(san_path_err_threshold_window); + merge_num(san_path_err_recovery_time); /* * Make sure features is consistent with diff --git a/libmultipath/config.h b/libmultipath/config.h index 9670020..2985958 100644 --- a/libmultipath/config.h +++ b/libmultipath/config.h @@ -65,6 +65,9 @@ struct hwentry { int deferred_remove; int delay_watch_checks; int delay_wait_checks; + int san_path_err_threshold; + int san_path_err_threshold_window; + int san_path_err_recovery_time; int skip_kpartx; int max_sectors_kb; char * bl_product; @@ -93,6 +96,9 @@ struct mpentry { int deferred_remove; int delay_watch_checks; int delay_wait_checks; + int san_path_err_threshold; + int san_path_err_threshold_window; + int san_path_err_recovery_time; int skip_kpartx; int max_sectors_kb; uid_t uid; @@ -138,6 +144,9 @@ struct config { int processed_main_config; int delay_watch_checks; int delay_wait_checks; + int san_path_err_threshold; + int san_path_err_threshold_window; + int san_path_err_recovery_time; int uxsock_timeout; int strict_timing; int retrigger_tries; diff --git a/libmultipath/configure.c b/libmultipath/configure.c index a0fcad9..0f50826 100644 --- a/libmultipath/configure.c +++ b/libmultipath/configure.c @@ -294,6 +294,9 @@ int setup_map(struct multipath *mpp, char *params, int params_size) select_deferred_remove(conf, mpp); select_delay_watch_checks(conf, mpp); select_delay_wait_checks(conf, mpp); + select_san_path_err_threshold(conf, mpp); + select_san_path_err_threshold_window(conf, mpp); + select_san_path_err_recovery_time(conf, mpp); select_skip_kpartx(conf, mpp); select_max_sectors_kb(conf, mpp); diff --git a/libmultipath/defaults.h 
b/libmultipath/defaults.h index b9b0a37..9e8059c 100644 --- a/libmultipath/defaults.h +++ b/libmultipath/defaults.h @@ -24,6 +24,7 @@ #define DEFAULT_DETECT_PRIO DETECT_PRIO_ON #define DEFAULT_DEFERRED_REMOVE DEFERRED_REMOVE_OFF #define DEFAULT_DELAY_CHECKS DELAY_CHECKS_OFF +#define DEFAULT_ERR_CHECKS ERR_CHECKS_OFF #define DEFAULT_UEVENT_STACKSIZE 256 #define DEFAULT_RETRIGGER_DELAY 10 #define DEFAULT_RETRIGGER_TRIES 3 diff --git a/libmultipath/dict.c b/libmultipath/dict.c index dc21846..a5689bd 100644 --- a/libmultipath/dict.c +++ b/libmultipath/dict.c @@ -1074,6 +1074,72 @@ declare_hw_snprint(delay_wait_checks, print_delay_checks) declare_mp_handler(delay_wait_checks, set_delay_checks) declare_mp_snprint(delay_wait_checks, print_delay_checks) +/* Value parser shared by the san_path_err_* options. Takes the string returned by set_value(strvec) and stores the result through ptr (treated as int *): ERR_CHECKS_OFF for the strings no or 0, the atoi() result when it is 1 or more, and ERR_CHECKS_UNDEF when atoi() yields less than 1 (negative or non-numeric input). Returns 1 if set_value() gives no string, otherwise frees the string and returns 0. */ +static int +set_path_err_info(vector strvec, void *ptr) +{ + int *int_ptr = (int *)ptr; + char * buff; + + buff = set_value(strvec); + if (!buff) + return 1; + + if (!strcmp(buff, "no") || !strcmp(buff, "0")) + *int_ptr = ERR_CHECKS_OFF; + else if ((*int_ptr = atoi(buff)) < 1) + *int_ptr = ERR_CHECKS_UNDEF; + + FREE(buff); + return 0; +} +/* Formats the int at ptr into buff (at most len bytes). ERR_CHECKS_UNDEF writes nothing and returns 0, so buff is left untouched in that case; ERR_CHECKS_OFF prints the quoted word off; any other value is printed as a decimal integer. The non-UNDEF cases return the snprintf() result. */ +int +print_path_err_info(char * buff, int len, void *ptr) +{ + int *int_ptr = (int *)ptr; + + switch(*int_ptr) { + case ERR_CHECKS_UNDEF: + return 0; + case ERR_CHECKS_OFF: + return snprintf(buff, len, "\"off\""); + default: + return snprintf(buff, len, "%i", *int_ptr); + } +} + + + + +/* Instantiate handler and snprint routines for each san_path_err_* option in the def, ovr, hw and mp scopes, all backed by set_path_err_info and print_path_err_info. The declare_* macros are defined outside this hunk; presumably they generate the SCOPE_OPTION_handler and snprint_SCOPE_OPTION functions that init_keywords registers -- TODO confirm. */ +declare_def_handler(san_path_err_threshold, set_path_err_info) +declare_def_snprint(san_path_err_threshold, print_path_err_info) +declare_ovr_handler(san_path_err_threshold, set_path_err_info) +declare_ovr_snprint(san_path_err_threshold, print_path_err_info) +declare_hw_handler(san_path_err_threshold, set_path_err_info) +declare_hw_snprint(san_path_err_threshold, print_path_err_info) +declare_mp_handler(san_path_err_threshold, set_path_err_info) +declare_mp_snprint(san_path_err_threshold, print_path_err_info) + +declare_def_handler(san_path_err_threshold_window, set_path_err_info) 
+declare_def_snprint(san_path_err_threshold_window, print_path_err_info) +declare_ovr_handler(san_path_err_threshold_window, set_path_err_info) +declare_ovr_snprint(san_path_err_threshold_window, print_path_err_info) +declare_hw_handler(san_path_err_threshold_window, set_path_err_info) +declare_hw_snprint(san_path_err_threshold_window, print_path_err_info) +declare_mp_handler(san_path_err_threshold_window, set_path_err_info) +declare_mp_snprint(san_path_err_threshold_window, print_path_err_info) + + +declare_def_handler(san_path_err_recovery_time, set_path_err_info) +declare_def_snprint(san_path_err_recovery_time, print_path_err_info) +declare_ovr_handler(san_path_err_recovery_time, set_path_err_info) +declare_ovr_snprint(san_path_err_recovery_time, print_path_err_info) +declare_hw_handler(san_path_err_recovery_time, set_path_err_info) +declare_hw_snprint(san_path_err_recovery_time, print_path_err_info) +declare_mp_handler(san_path_err_recovery_time, set_path_err_info) +declare_mp_snprint(san_path_err_recovery_time, print_path_err_info) static int def_uxsock_timeout_handler(struct config *conf, vector strvec) { @@ -1404,6 +1470,10 @@ init_keywords(vector keywords) install_keyword("config_dir", &def_config_dir_handler, &snprint_def_config_dir); install_keyword("delay_watch_checks", &def_delay_watch_checks_handler, &snprint_def_delay_watch_checks); install_keyword("delay_wait_checks", &def_delay_wait_checks_handler, &snprint_def_delay_wait_checks); + install_keyword("san_path_err_threshold", &def_san_path_err_threshold_handler, &snprint_def_san_path_err_threshold); + install_keyword("san_path_err_threshold_window", &def_san_path_err_threshold_window_handler, &snprint_def_san_path_err_threshold_window); + install_keyword("san_path_err_recovery_time", &def_san_path_err_recovery_time_handler, &snprint_def_san_path_err_recovery_time); + install_keyword("find_multipaths", &def_find_multipaths_handler, &snprint_def_find_multipaths); install_keyword("uxsock_timeout", 
&def_uxsock_timeout_handler, &snprint_def_uxsock_timeout); install_keyword("retrigger_tries", &def_retrigger_tries_handler, &snprint_def_retrigger_tries); @@ -1486,6 +1556,9 @@ init_keywords(vector keywords) install_keyword("deferred_remove", &hw_deferred_remove_handler, &snprint_hw_deferred_remove); install_keyword("delay_watch_checks", &hw_delay_watch_checks_handler, &snprint_hw_delay_watch_checks); install_keyword("delay_wait_checks", &hw_delay_wait_checks_handler, &snprint_hw_delay_wait_checks); + install_keyword("san_path_err_threshold", &hw_san_path_err_threshold_handler, &snprint_hw_san_path_err_threshold); + install_keyword("san_path_err_threshold_window", &hw_san_path_err_threshold_window_handler, &snprint_hw_san_path_err_threshold_window); + install_keyword("san_path_err_recovery_time", &hw_san_path_err_recovery_time_handler, &snprint_hw_san_path_err_recovery_time); install_keyword("skip_kpartx", &hw_skip_kpartx_handler, &snprint_hw_skip_kpartx); install_keyword("max_sectors_kb", &hw_max_sectors_kb_handler, &snprint_hw_max_sectors_kb); install_sublevel_end(); @@ -1515,6 +1588,10 @@ init_keywords(vector keywords) install_keyword("deferred_remove", &ovr_deferred_remove_handler, &snprint_ovr_deferred_remove); install_keyword("delay_watch_checks", &ovr_delay_watch_checks_handler, &snprint_ovr_delay_watch_checks); install_keyword("delay_wait_checks", &ovr_delay_wait_checks_handler, &snprint_ovr_delay_wait_checks); + install_keyword("san_path_err_threshold", &ovr_san_path_err_threshold_handler, &snprint_ovr_san_path_err_threshold); + install_keyword("san_path_err_threshold_window", &ovr_san_path_err_threshold_window_handler, &snprint_ovr_san_path_err_threshold_window); + install_keyword("san_path_err_recovery_time", &ovr_san_path_err_recovery_time_handler, &snprint_ovr_san_path_err_recovery_time); + install_keyword("skip_kpartx", &ovr_skip_kpartx_handler, &snprint_ovr_skip_kpartx); install_keyword("max_sectors_kb", &ovr_max_sectors_kb_handler, 
&snprint_ovr_max_sectors_kb); @@ -1543,6 +1620,9 @@ init_keywords(vector keywords) install_keyword("deferred_remove", &mp_deferred_remove_handler, &snprint_mp_deferred_remove); install_keyword("delay_watch_checks", &mp_delay_watch_checks_handler, &snprint_mp_delay_watch_checks); install_keyword("delay_wait_checks", &mp_delay_wait_checks_handler, &snprint_mp_delay_wait_checks); + install_keyword("san_path_err_threshold", &mp_san_path_err_threshold_handler, &snprint_mp_san_path_err_threshold); + install_keyword("san_path_err_threshold_window", &mp_san_path_err_threshold_window_handler, &snprint_mp_san_path_err_threshold_window); + install_keyword("san_path_err_recovery_time", &mp_san_path_err_recovery_time_handler, &snprint_mp_san_path_err_recovery_time); install_keyword("skip_kpartx", &mp_skip_kpartx_handler, &snprint_mp_skip_kpartx); install_keyword("max_sectors_kb", &mp_max_sectors_kb_handler, &snprint_mp_max_sectors_kb); install_sublevel_end(); diff --git a/libmultipath/dict.h b/libmultipath/dict.h index 4cd03c5..adaa9f1 100644 --- a/libmultipath/dict.h +++ b/libmultipath/dict.h @@ -15,5 +15,6 @@ int print_fast_io_fail(char * buff, int len, void *ptr); int print_dev_loss(char * buff, int len, void *ptr); int print_reservation_key(char * buff, int len, void * ptr); int print_delay_checks(char * buff, int len, void *ptr); +int print_path_err_info(char * buff, int len, void *ptr); #endif /* _DICT_H */ diff --git a/libmultipath/propsel.c b/libmultipath/propsel.c index c0bc616..f4ca378 100644 --- a/libmultipath/propsel.c +++ b/libmultipath/propsel.c @@ -643,7 +643,51 @@ out: return 0; } +int select_san_path_err_threshold(struct config *conf, struct multipath *mp) +{ + char *origin, buff[12]; + /* Choose san_path_err_threshold for mp. The mp_set_* macros are defined elsewhere; they are invoked in the order mpe, ovr, hwe, conf, then the DEFAULT_ERR_CHECKS fallback, and presumably jump to the out label and set origin once one of them supplies a value -- TODO confirm. The chosen value is formatted into buff and logged at condlog level 3; the function always returns 0. NOTE(review): print_path_err_info writes nothing for ERR_CHECKS_UNDEF, so buff is only defined if the selected value is never UNDEF -- confirm the macros guarantee that. */ + mp_set_mpe(san_path_err_threshold); + mp_set_ovr(san_path_err_threshold); + mp_set_hwe(san_path_err_threshold); + mp_set_conf(san_path_err_threshold); + mp_set_default(san_path_err_threshold, DEFAULT_ERR_CHECKS); +out: + print_path_err_info(buff, 12, 
&mp->san_path_err_threshold); + condlog(3, "%s: san_path_err_threshold = %s %s", mp->alias, buff, origin); + return 0; +} + +int select_san_path_err_threshold_window(struct config *conf, struct multipath *mp) +{ + char *origin, buff[12]; + /* Choose san_path_err_threshold_window for mp: mp_set_* macros (defined elsewhere) are invoked in the order mpe, ovr, hwe, conf, then the DEFAULT_ERR_CHECKS fallback; presumably the first one that supplies a value sets origin and jumps to out -- TODO confirm. The result is logged at condlog level 3 and the function always returns 0. */ + mp_set_mpe(san_path_err_threshold_window); + mp_set_ovr(san_path_err_threshold_window); + mp_set_hwe(san_path_err_threshold_window); + mp_set_conf(san_path_err_threshold_window); + mp_set_default(san_path_err_threshold_window, DEFAULT_ERR_CHECKS); +out: + print_path_err_info(buff, 12, &mp->san_path_err_threshold_window); + condlog(3, "%s: san_path_err_threshold_window = %s %s", mp->alias, buff, origin); + return 0; + +} +int select_san_path_err_recovery_time(struct config *conf, struct multipath *mp) +{ + char *origin, buff[12]; /* Choose san_path_err_recovery_time for mp: mp_set_* macros (defined elsewhere) are invoked in the order mpe, ovr, hwe, conf, then the DEFAULT_ERR_CHECKS fallback; presumably the first one that supplies a value sets origin and jumps to out -- TODO confirm. The result is logged at condlog level 3 and the function always returns 0. */ + mp_set_mpe(san_path_err_recovery_time); + mp_set_ovr(san_path_err_recovery_time); + mp_set_hwe(san_path_err_recovery_time); + mp_set_conf(san_path_err_recovery_time); + mp_set_default(san_path_err_recovery_time, DEFAULT_ERR_CHECKS); +out: + print_path_err_info(buff, 12, &mp->san_path_err_recovery_time); + condlog(3, "%s: san_path_err_recovery_time = %s %s", mp->alias, buff, origin); + return 0; + +} int select_skip_kpartx (struct config *conf, struct multipath * mp) { char *origin; diff --git a/libmultipath/propsel.h b/libmultipath/propsel.h index ad98fa5..88b5840 100644 --- a/libmultipath/propsel.h +++ b/libmultipath/propsel.h @@ -24,3 +24,9 @@ int select_delay_watch_checks (struct config *conf, struct multipath * mp); int select_delay_wait_checks (struct config *conf, struct multipath * mp); int select_skip_kpartx (struct config *conf, struct multipath * mp); int select_max_sectors_kb (struct config *conf, struct multipath * mp); +int select_san_path_err_threshold_window(struct config *conf, struct multipath *mp); +int select_san_path_err_threshold(struct config *conf, struct multipath *mp); +int select_san_path_err_recovery_time(struct config *conf, struct multipath *mp); + + + diff --git 
a/libmultipath/structs.h b/libmultipath/structs.h index 396f69d..8b7a803 100644 --- a/libmultipath/structs.h +++ b/libmultipath/structs.h @@ -156,6 +156,10 @@ enum delay_checks_states { DELAY_CHECKS_OFF = -1, DELAY_CHECKS_UNDEF = 0, }; +enum err_checks_states { + ERR_CHECKS_OFF = -1, /* stored by set_path_err_info for the config strings no and 0; also the value of DEFAULT_ERR_CHECKS */ + ERR_CHECKS_UNDEF = 0, /* stored by set_path_err_info when the value converts to less than 1; presumably also the unset state -- TODO confirm */ +}; enum initialized_states { INIT_FAILED, @@ -223,7 +227,10 @@ struct path { int initialized; int retriggers; int wwid_changed; - + unsigned int path_failures; /* incremented by update_multipath each time the path is marked failed; reset to 0 by check_path_validity_err in multipathd */ + time_t failure_start_time; /* CLOCK_MONOTONIC seconds captured when path_failures was 0 at the time of a failure (0 if clock_gettime failed) */ + time_t dis_reinstante_time; /* CLOCK_MONOTONIC seconds at which check_path_validity_err started holding off reinstatement */ + int disable_reinstate; /* nonzero while check_path_validity_err is holding off reinstatement of this path */ /* configlet pointers */ struct hwentry * hwe; }; @@ -255,6 +262,9 @@ struct multipath { int deferred_remove; int delay_watch_checks; int delay_wait_checks; + int san_path_err_threshold; + int san_path_err_threshold_window; + int san_path_err_recovery_time; int skip_kpartx; int max_sectors_kb; unsigned int dev_loss; diff --git a/libmultipath/structs_vec.c b/libmultipath/structs_vec.c index 22be8e0..bf84b17 100644 --- a/libmultipath/structs_vec.c +++ b/libmultipath/structs_vec.c @@ -546,6 +546,7 @@ int update_multipath (struct vectors *vecs, char *mapname, int reset) struct pathgroup *pgp; struct path *pp; int i, j; + struct timespec start_time; mpp = find_mp_by_alias(vecs->mpvec, mapname); @@ -570,6 +571,15 @@ int update_multipath (struct vectors *vecs, char *mapname, int reset) int oldstate = pp->state; condlog(2, "%s: mark as failed", pp->dev); mpp->stat_path_failures++; + /* Capture the time at which the first failure on the path is seen; a failed clock_gettime records 0 instead */ + if(pp->path_failures == 0) { + if (clock_gettime(CLOCK_MONOTONIC, &start_time) != 0) + start_time.tv_sec = 0; + pp->failure_start_time = start_time.tv_sec; + + } + /* Increment the number of path failures */ + pp->path_failures++; pp->state = PATH_DOWN; if (oldstate == PATH_UP || oldstate == PATH_GHOST) diff --git a/multipath/multipath.conf.5 b/multipath/multipath.conf.5 index 36589f5..7dfd48a 100644 --- a/multipath/multipath.conf.5 +++ b/multipath/multipath.conf.5 @@ -751,6 +751,46 @@ 
The default is: \fB/etc/multipath/conf.d/\fR . . .TP +.B san_path_err_threshold +If set to a value greater than 0, multipathd will watch paths and count how many +times each path has failed due to errors. If the number of failures on a particular +path is greater than san_path_err_threshold, the path will not be reinstated +until san_path_err_recovery_time has elapsed. The failures must occur within a +san_path_err_threshold_window time frame; if they do not, the path is considered good enough +to reinstate. +.RS +.TP +The default is: \fBno\fR +.RE +. +. +.TP +.B san_path_err_threshold_window +If set to a value greater than 0, multipathd will check whether the number of path failures +has exceeded san_path_err_threshold within this time frame, i.e. within +san_path_err_threshold_window. If so, the path will not be reinstated until +san_path_err_recovery_time has elapsed. +The san_path_err_threshold_window value is in seconds. +.RS +.TP +The default is: \fBno\fR +.RE +. +. +.TP +.B san_path_err_recovery_time +If set to a value greater than 0, multipathd will make sure that when the number of path failures +has exceeded san_path_err_threshold within san_path_err_threshold_window, the path +is kept in the failed state for the san_path_err_recovery_time duration. Once san_path_err_recovery_time +has elapsed, the failed path is reinstated. +The san_path_err_recovery_time value is in seconds. +.RS +.TP +The default is: \fBno\fR +.RE +. +. +.TP .B delay_watch_checks If set to a value greater than 0, multipathd will watch paths that have recently become valid for this many checks. 
If they fail again while they are @@ -1015,6 +1055,12 @@ are taken from the \fIdefaults\fR or \fIdevices\fR section: .TP .B deferred_remove .TP +.B san_path_err_threshold +.TP +.B san_path_err_threshold_window +.TP +.B san_path_err_recovery_time +.TP .B delay_watch_checks .TP .B delay_wait_checks @@ -1128,6 +1174,12 @@ section: .TP .B deferred_remove .TP +.B san_path_err_threshold +.TP +.B san_path_err_threshold_window +.TP +.B san_path_err_recovery_time +.TP .B delay_watch_checks .TP .B delay_wait_checks @@ -1192,6 +1244,12 @@ the values are taken from the \fIdevices\fR or \fIdefaults\fR sections: .TP .B deferred_remove .TP +.B san_path_err_threshold +.TP +.B san_path_err_threshold_window +.TP +.B san_path_err_recovery_time +.TP .B delay_watch_checks .TP .B delay_wait_checks diff --git a/multipathd/main.c b/multipathd/main.c index adc3258..facfc03 100644 --- a/multipathd/main.c +++ b/multipathd/main.c @@ -1486,7 +1486,54 @@ void repair_path(struct path * pp) checker_repair(&pp->checker); LOG_MSG(1, checker_message(&pp->checker)); } +static int check_path_validity_err( struct path * pp){ + struct timespec start_time; + int disable_reinstate = 0; + /* Returns 1 when reinstating pp must be held off because its failure count exceeded san_path_err_threshold, otherwise 0. Also updates path_failures, disable_reinstate and dis_reinstante_time on pp. Times are CLOCK_MONOTONIC seconds; if clock_gettime() fails the current time is taken as 0. */ + if (clock_gettime(CLOCK_MONOTONIC, &start_time) != 0) + start_time.tv_sec = 0; + + /* The number of path failures is greater than san_path_err_threshold */ + if((pp->mpp->san_path_err_threshold > 0)&& (pp->path_failures > pp->mpp->san_path_err_threshold)){ + condlog(3,"\npath %s :hit the error threshold\n",pp->dev); + + if(!pp->disable_reinstate){ + /* If the error threshold was exceeded within the san_path_err_threshold_window + * time frame, do not reinstate the path until san_path_err_recovery_time has elapsed. + * The path is kept in the failed state for that period so that the + * customer can rectify the issue in the meantime. Once + * san_path_err_recovery_time is over, the path is reinstated automatically. + * */ + if((pp->mpp->san_path_err_threshold_window > 0) && + ((start_time.tv_sec - 
pp->failure_start_time) < pp->mpp->san_path_err_threshold_window)){ + condlog(3,"\npath %s :hit the error threshold within the thrshold window time\n",pp->dev); + disable_reinstate = 1; + pp->dis_reinstante_time = start_time.tv_sec ; + pp->disable_reinstate = 1; + }else{ + /* Even though the number of errors is greater than san_path_err_threshold, + * they did not all occur within san_path_err_threshold_window, so these + * errors are discarded and counting starts over. This branch is also taken when san_path_err_threshold_window is not greater than 0. + */ + pp->path_failures = 0; + pp->disable_reinstate = 0; + + } + } + if(pp->disable_reinstate){ /* hold-off active: keep reporting 1 until more than san_path_err_recovery_time seconds have passed since dis_reinstante_time; if san_path_err_recovery_time is not greater than 0 the hold-off is never lifted here */ + disable_reinstate = 1; + if((pp->mpp->san_path_err_recovery_time > 0) && + (start_time.tv_sec - pp->dis_reinstante_time ) > pp->mpp->san_path_err_recovery_time){ + disable_reinstate =0; + pp->path_failures = 0; + pp->disable_reinstate = 0; + condlog(3,"\npath %s :reinstate the path after err recovery time\n",pp->dev); + } + } + } + return disable_reinstate; +} /* * Returns '1' if the path has been checked, '-1' if it was blacklisted * and '0' otherwise @@ -1503,7 +1550,11 @@ check_path (struct vectors * vecs, struct path * pp, int ticks) int retrigger_tries, checkint; struct config *conf; int ret; + struct timespec start_time; + if (clock_gettime(CLOCK_MONOTONIC, &start_time) != 0) + start_time.tv_sec = 0; + if ((pp->initialized == INIT_OK || pp->initialized == INIT_REQUESTED_UDEV) && !pp->mpp) return 0; @@ -1615,12 +1666,18 @@ check_path (struct vectors * vecs, struct path * pp, int ticks) * and if target supports only implicit tpgs mode. * this will prevent unnecessary i/o by dm on stand-by * paths if there are no other active paths in map. 
+ * + * when path failures has exceeded the san_path_err_threshold + * within san_path_err_threshold_window then we don't reinstate + * failed path for san_path_err_recovery_time */ - disable_reinstate = (newstate == PATH_GHOST && + disable_reinstate = ((newstate == PATH_GHOST && pp->mpp->nr_active == 0 && - pp->tpgs == TPGS_IMPLICIT) ? 1 : 0; + pp->tpgs == TPGS_IMPLICIT) ? 1 : + check_path_validity_err(pp)); pp->chkrstate = newstate; + if (newstate != pp->state) { int oldstate = pp->state; pp->state = newstate;