[dm-devel] [PATCH] multipath: fix scsi async tur checker corruption
Hannes Reinecke
hare at suse.de
Wed Dec 21 08:17:52 UTC 2011
On 12/21/2011 12:01 AM, Benjamin Marzinski wrote:
> Since the tur checker runs asynchronously in its own thread, there is nothing
> that keeps a path from being orphaned or deleted before the tur thread has
> finished. When this happenes the checker struct gets deleted. However, the tur
> thread might still we writing to that memory. This can lead to memory
> corruption. This patch adds all of the necessary data to the checker context,
> and makes the tur thread only use that. This way, if the checker is deleted
> while the thread is still using the context, the thread will clean up the
> context itself.
>
> Signed-off-by: Benjamin Marzinski <bmarzins at redhat.com>
> ---
> libmultipath/checkers/tur.c | 85 +++++++++++++++++++++++++++++++++-----------
> libmultipath/discovery.c | 1
> 2 files changed, 65 insertions(+), 21 deletions(-)
>
> Index: multipath-tools-111219/libmultipath/checkers/tur.c
> ===================================================================
> --- multipath-tools-111219.orig/libmultipath/checkers/tur.c
> +++ multipath-tools-111219/libmultipath/checkers/tur.c
> @@ -35,10 +35,15 @@ struct tur_checker_context {
> dev_t devt;
> int state;
> int running;
> - time_t timeout;
> + int fd;
> + unsigned int timeout;
> + time_t time;
> pthread_t thread;
> pthread_mutex_t lock;
> pthread_cond_t active;
> + pthread_spinlock_t hldr_lock;
> + int holders;
> + char message[CHECKER_MSG_LEN];
> };
>
> #define TUR_DEVT(c) major((c)->devt), minor((c)->devt)
> @@ -53,28 +58,49 @@ int libcheck_init (struct checker * c)
> memset(ct, 0, sizeof(struct tur_checker_context));
>
> ct->state = PATH_UNCHECKED;
> + ct->fd = -1;
> + ct->holders = 1;
> pthread_cond_init(&ct->active, NULL);
> pthread_mutex_init(&ct->lock, NULL);
> + pthread_spin_init(&ct->hldr_lock, PTHREAD_PROCESS_PRIVATE);
> c->context = ct;
>
> return 0;
> }
>
> +void cleanup_context(struct tur_checker_context *ct)
> +{
> + pthread_mutex_destroy(&ct->lock);
> + pthread_cond_destroy(&ct->active);
> + pthread_spin_destroy(&ct->hldr_lock);
> + free(ct);
> +}
> +
> void libcheck_free (struct checker * c)
> {
> if (c->context) {
> struct tur_checker_context *ct = c->context;
> + int holders;
> + pthread_t thread;
>
> - pthread_mutex_destroy(&ct->lock);
> - pthread_cond_destroy(&ct->active);
> - free(c->context);
> + pthread_spin_lock(&ct->hldr_lock);
> + ct->holders--;
> + holders = ct->holders;
> + thread = ct->thread;
> + pthread_spin_unlock(&ct->hldr_lock);
> + if (holders)
> + pthread_cancel(thread);
> + else
> + cleanup_context(ct);
> c->context = NULL;
> }
> return;
> }
>
> +#define TUR_MSG(msg, fmt, args...) snprintf(msg, CHECKER_MSG_LEN, fmt, ##args);
> +
> int
> -tur_check (struct checker * c)
> +tur_check(int fd, unsigned int timeout, char *msg)
> {
> struct sg_io_hdr io_hdr;
> unsigned char turCmdBlk[TUR_CMD_LEN] = { 0x00, 0, 0, 0, 0, 0 };
> @@ -90,10 +116,10 @@ tur_check (struct checker * c)
> io_hdr.dxfer_direction = SG_DXFER_NONE;
> io_hdr.cmdp = turCmdBlk;
> io_hdr.sbp = sense_buffer;
> - io_hdr.timeout = c->timeout;
> + io_hdr.timeout = timeout;
> io_hdr.pack_id = 0;
> - if (ioctl(c->fd, SG_IO, &io_hdr) < 0) {
> - MSG(c, MSG_TUR_DOWN);
> + if (ioctl(fd, SG_IO, &io_hdr) < 0) {
> + TUR_MSG(msg, MSG_TUR_DOWN);
> return PATH_DOWN;
> }
> if ((io_hdr.status & 0x7e) == 0x18) {
> @@ -101,7 +127,7 @@ tur_check (struct checker * c)
> * SCSI-3 arrays might return
> * reservation conflict on TUR
> */
> - MSG(c, MSG_TUR_UP);
> + TUR_MSG(msg, MSG_TUR_UP);
> return PATH_UP;
> }
> if (io_hdr.info & SG_INFO_OK_MASK) {
> @@ -146,14 +172,14 @@ tur_check (struct checker * c)
> * LOGICAL UNIT NOT ACCESSIBLE,
> * TARGET PORT IN STANDBY STATE
> */
> - MSG(c, MSG_TUR_GHOST);
> + TUR_MSG(msg, MSG_TUR_GHOST);
> return PATH_GHOST;
> }
> }
> - MSG(c, MSG_TUR_DOWN);
> + TUR_MSG(msg, MSG_TUR_DOWN);
> return PATH_DOWN;
> }
> - MSG(c, MSG_TUR_UP);
> + TUR_MSG(msg, MSG_TUR_UP);
> return PATH_UP;
> }
>
> @@ -162,18 +188,25 @@ tur_check (struct checker * c)
>
> void cleanup_func(void *data)
> {
> + int holders;
> struct tur_checker_context *ct = data;
> + pthread_spin_lock(&ct->hldr_lock);
> + ct->holders--;
> + holders = ct->holders;
> ct->thread = 0;
> + pthread_spin_unlock(&ct->hldr_lock);
> + if (!holders)
> + cleanup_context(ct);
> }
>
> void *tur_thread(void *ctx)
> {
> - struct checker *c = ctx;
> - struct tur_checker_context *ct = c->context;
> + struct tur_checker_context *ct = ctx;
> int state;
>
> condlog(3, "%d:%d: tur checker starting up", TUR_DEVT(ct));
>
> + ct->message[0] = '\0';
> /* This thread can be canceled, so setup clean up */
> tur_thread_cleanup_push(ct)
>
> @@ -182,7 +215,7 @@ void *tur_thread(void *ctx)
> ct->state = PATH_PENDING;
> pthread_mutex_unlock(&ct->lock);
>
> - state = tur_check(c);
> + state = tur_check(ct->fd, ct->timeout, ct->message);
>
> /* TUR checker done */
> pthread_mutex_lock(&ct->lock);
> @@ -213,7 +246,7 @@ void tur_set_async_timeout(struct checke
> struct timeval now;
>
> gettimeofday(&now, NULL);
> - ct->timeout = now.tv_sec + c->timeout;
> + ct->time = now.tv_sec + c->timeout;
> }
>
> int tur_check_async_timeout(struct checker *c)
> @@ -222,7 +255,7 @@ int tur_check_async_timeout(struct check
> struct timeval now;
>
> gettimeofday(&now, NULL);
> - return (now.tv_sec > ct->timeout);
> + return (now.tv_sec > ct->time);
> }
>
> extern int
> @@ -242,7 +275,7 @@ libcheck_check (struct checker * c)
> ct->devt = sb.st_rdev;
>
> if (c->sync)
> - return tur_check(c);
> + return tur_check(c->fd, c->timeout, c->message);
>
> /*
> * Async mode
> @@ -276,6 +309,8 @@ libcheck_check (struct checker * c)
> /* TUR checker done */
> ct->running = 0;
> tur_status = ct->state;
> + strncpy(c->message, ct->message, CHECKER_MSG_LEN);
> + c->message[CHECKER_MSG_LEN - 1] = '\0';
> }
> pthread_mutex_unlock(&ct->lock);
> } else {
> @@ -284,24 +319,32 @@ libcheck_check (struct checker * c)
> pthread_mutex_unlock(&ct->lock);
> condlog(3, "%d:%d: tur thread not responding, "
> "using sync mode", TUR_DEVT(ct));
> - return tur_check(c);
> + return tur_check(c->fd, c->timeout, c->message);
> }
> /* Start new TUR checker */
> ct->state = PATH_UNCHECKED;
> + ct->fd = c->fd;
> + ct->timeout = c->timeout;
> + pthread_spin_lock(&ct->hldr_lock);
> + ct->holders++;
> + pthread_spin_unlock(&ct->hldr_lock);
> tur_set_async_timeout(c);
> setup_thread_attr(&attr, 32 * 1024, 1);
> - r = pthread_create(&ct->thread, &attr, tur_thread, c);
> + r = pthread_create(&ct->thread, &attr, tur_thread, ct);
> if (r) {
> pthread_mutex_unlock(&ct->lock);
> ct->thread = 0;
> + ct->holders--;
> condlog(3, "%d:%d: failed to start tur thread, using"
> " sync mode", TUR_DEVT(ct));
> - return tur_check(c);
> + return tur_check(c->fd, c->timeout, c->message);
> }
> pthread_attr_destroy(&attr);
> tur_timeout(&tsp);
> r = pthread_cond_timedwait(&ct->active, &ct->lock, &tsp);
> tur_status = ct->state;
> + strncpy(c->message, ct->message,CHECKER_MSG_LEN);
> + c->message[CHECKER_MSG_LEN -1] = '\0';
> pthread_mutex_unlock(&ct->lock);
> if (ct->thread &&
> (tur_status == PATH_PENDING || tur_status == PATH_UNCHECKED)) {
> Index: multipath-tools-111219/libmultipath/discovery.c
> ===================================================================
> --- multipath-tools-111219.orig/libmultipath/discovery.c
> +++ multipath-tools-111219/libmultipath/discovery.c
> @@ -826,6 +826,7 @@ get_state (struct path * pp, int daemon)
> }
> checker_set_fd(c, pp->fd);
> if (checker_init(c, pp->mpp?&pp->mpp->mpcontext:NULL)) {
> + memset(c, 0x0, sizeof(struct checker));
> condlog(3, "%s: checker init failed", pp->dev);
> return PATH_UNCHECKED;
> }
>
Hmm. Why do we need ->holders here?
In theory we can have only two states; async thread running or no
thread running. I really cannot imagine a scenario where ->holders
would be > 1.
Or, to stress the point a bit more, any scenario where holders > 1
would most definitely be a bug, as this would indicate that an async
tur thread is running, but we somehow failed to notice this, and
started a new one. Which means we end with two threads doing exactly
the same thing. And which was _exactly_ what we want to avoid with
at startup.
So I guess we can do away with ->holders.
And once ->holders is gone, I doubt it'll make any sense to retain
the spinlock, as we then just have ->thread to worry about.
And this doesn't need to be spinlock protected, as we take care to
synchronize during startup. And during shutdown we simply don't
care, as pthread_cancel() will return an error if the thread is
already done for.
Cheers,
Hannes
--
Dr. Hannes Reinecke zSeries & Storage
hare at suse.de +49 911 74053 688
SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: J. Hawn, J. Guild, F. Imendörffer, HRB 16746 (AG Nürnberg)
More information about the dm-devel
mailing list