[dm-devel] patch to dm-emc.c

Thu Aug 10 07:43:15 UTC 2006

egoggin at emc.com wrote:
>  
> +	/*
> +	 * Link all paths of mapped device to the same hwh context.
> +	 */
> +	hwh = &m->hw_handler;
> +	if (hwh->type) {
> +		list_for_each_entry(pg, &m->priority_groups, list) {
> +			list_for_each_entry(pgpath, &pg->pgpaths, list) {
> +				(path = &pgpath->path)->hwhcontext =
> hwh->context;

This and some other places got wrapped.

> +			}
> +		}
> +	}
> +
> +	/*
> +	 * Initialize the hardware context now that all paths are setup,
> +	 * pass any one of the paths to be used to access the device.
> +	 */
> +	if (hwh->type && hwh->type->init && path) {
> +		/* pass mapped device name for event logging */
> +		const char *name =
> dm_device_name(dm_table_get_md(ti->table));
> +		hwh->type->init(hwh, name, path);
> +		dm_table_put_md(ti->table);
> +	}
> +
>  	ti->private = m;
> -	m->ti = ti;
>  
>  	return 0;
>  
> 
> --- drivers/md/dm-hw-handler.h.orig	2006-08-03 04:04:54.000000000 -0500
> +++ drivers/md/dm-hw-handler.h	2006-08-03 04:04:54.000000000 -0500
> @@ -29,6 +29,8 @@
>  
>  	int (*create) (struct hw_handler *handler, unsigned int argc,
>  		       char **argv);
> +	void (*init) (struct hw_handler *hwh, const char *name,
> +		      struct path *path);
>  	void (*destroy) (struct hw_handler *hwh);
>  
>  	void (*pg_init) (struct hw_handler *hwh, unsigned bypassed,
> 
> 
> --- drivers/md/dm-emc.c.orig	2006-08-03 04:04:54.000000000 -0500
> +++ drivers/md/dm-emc.c	2006-08-06 20:40:48.000000000 -0500
> @@ -10,224 +10,391 @@
>  #include "dm.h"
>  #include "dm-hw-handler.h"
>  #include <scsi/scsi.h>
> +#include <scsi/scsi_eh.h>
>  #include <scsi/scsi_cmnd.h>
>  
> -#define DM_MSG_PREFIX "multipath emc"
> +#define DM_MSG_PREFIX		"multipath emc"
> +#define TRESPASS_PAGE		0x22
> +#define BUFFER_SIZE		0x80
> +#define EMC_HANDLER_TIMEOUT	(60 * HZ)
> +#define	UNBOUND_LU		-1
> +
> +/*
> + * Four variations of the CLARiiON trespass MODE_SENSE page.
> + */
> +unsigned char long_trespass_and_hr_pg[] = {
> +	0, 0, 0, 0,
> +	TRESPASS_PAGE,        /* Page code */
> +	0x09,                 /* Page length - 2 */
> +	0x01,  		      /* Trespass code + Honor reservation bit */
> +	0xff, 0xff,           /* Trespass target */
> +	0, 0, 0, 0, 0, 0      /* Reserved bytes / unknown */
> +};
> +unsigned char long_trespass_pg[] = {
> +	0, 0, 0, 0,
> +	TRESPASS_PAGE,        /* Page code */
> +	0x09,                 /* Page length - 2 */
> +	0x81,  		      /* Trespass code + Honor reservation bit */
> +	0xff, 0xff,           /* Trespass target */
> +	0, 0, 0, 0, 0, 0      /* Reserved bytes / unknown */
> +};
> +unsigned char short_trespass_and_hr_pg[] = {
> +	0, 0, 0, 0,
> +	TRESPASS_PAGE,        /* Page code */
> +	0x02,                 /* Page length - 2 */
> +	0x01,  		      /* Trespass code + Honor reservation bit */
> +	0xff,                 /* Trespass target */
> +};
> +unsigned char short_trespass_pg[] = {
> +	0, 0, 0, 0,
> +	TRESPASS_PAGE,        /* Page code */
> +	0x02,                 /* Page length - 2 */
> +	0x81,  		      /* Trespass code + Honor reservation bit */
> +	0xff,                 /* Trespass target */
> +};
>  
> +/*
> + * EMC hardware handler context structure containing CLARiiON LU specific
> + * information for a particular dm multipath mapped device.
> + */
>  struct emc_handler {
>  	spinlock_t lock;
> -
> -	/* Whether we should send the short trespass command (FC-series)
> -	 * or the long version (default for AX/CX CLARiiON arrays). */
> +	/* name of mapped device */
> +	char name[16];
> +	struct hw_handler *hwh;
> +	struct work_struct wkq;
> +	struct request req;

You should not have to do this should you? The queue has a mempool of
requests. The only way it could be exhausted is if some app is hogging
them by doing SG_IO (since dm bd_claims the device), you do not use
GFP_WAIT in your allocation, and you really hit some nasty case where
your process is starved because you keep missing the chance to allocate
a request that is freed and put back in the pool. If you are that
unlucky and that happens though, you would have problems elsewhere
though so maybe a generic solution to make sure no one gets starved
would be better.

> +	struct path *path;
> +	/* Use short trespass command (FC-series) or the long version
> +	 * (default for AX/CX CLARiiON arrays). */
>  	unsigned short_trespass;
> -	/* Whether or not to honor SCSI reservations when initiating a
> -	 * switch-over. Default: Don't. */
> +	/* Whether or not (default) to honor SCSI reservations when
> +	 * initiating a switch-over. */
>  	unsigned hr;
> -
> +	/* I/O buffer for both MODE_SELECT and INQUIRY commands. */
> +	char buffer[BUFFER_SIZE];
> +	/* SCSI sense buffer for commands -- assumes serial issuance
> +	 * and completion sequence of all commands for same multipath. */
>  	unsigned char sense[SCSI_SENSE_BUFFERSIZE];
> +	/* which SP (A=0,B=1,UNBOUND=-1) is default SP for path's mapped dev
> */
> +	int defaultSP;
> +	/* which SP (A=0,B=1,UNBOUND=-1) is active for path's mapped dev */
> +	int currentSP;
>  };
>  
> -#define TRESPASS_PAGE 0x22
> -#define EMC_FAILOVER_TIMEOUT (60 * HZ)
> +struct workqueue_struct *kemchd;
>  
>  /* Code borrowed from dm-lsi-rdac by Mike Christie */
>  
> -static inline void free_bio(struct bio *bio)
> +/*
> + * Get block request for REQ_BLOCK_PC command issued to path.  Currently
> + * limited to MODE_SENSE (trespass) and INQUIRY (VPD page 0xC0) commands.
> + *
> + * Uses data and sense buffers in hardware handler context structure and
> + * assumes serial servicing of commands, both issuance and completion.
> + */
> +static struct request *get_req(struct path *path, int opcode)
>  {
> -	__free_page(bio->bi_io_vec[0].bv_page);
> -	bio_put(bio);
> -}
> +	struct emc_handler *h = (struct emc_handler *)path->hwhcontext;
> +	struct request_queue *q = bdev_get_queue(path->dev->bdev);
> +	struct request *rq;
> +	void *buffer;
> +	int len = 0;
>  
> -static int emc_endio(struct bio *bio, unsigned int bytes_done, int error)
> -{
> -	struct path *path = bio->bi_private;
> +	/*
> +	 * Re-use the pre-allocated request struct in order to avoid
> deadlock
> +	 * scenarios trying to allocate a request on a target queue which is
> +	 * over its read or write request threshold.
> +	 */
> +	rq = &h->req;
> +	rq_init(q, rq);
> +	rq->elevator_private = NULL;
> +	rq->rq_disk = NULL;
> +	rq->rl = NULL;
> +	rq->flags = 0;
>  
> -	if (bio->bi_size)
> -		return 1;
> +	memset(&rq->cmd, 0, BLK_MAX_CDB);
> +	rq->cmd[0] = opcode;
> +	rq->cmd_len = COMMAND_SIZE(rq->cmd[0]);
>  
> -	/* We also need to look at the sense keys here whether or not to
> -	 * switch to the next PG etc.
> -	 *
> -	 * For now simple logic: either it works or it doesn't.
> -	 */
> -	if (error)
> -		dm_pg_init_complete(path, MP_FAIL_PATH);
> -	else
> -		dm_pg_init_complete(path, 0);
> +	switch (opcode) {
> +	case MODE_SELECT:
> +		rq->flags |= REQ_RW;
> +		rq->cmd[1] = 0x10;
> +		len = h->short_trespass ? sizeof(short_trespass_and_hr_pg) :
> +				        sizeof(long_trespass_and_hr_pg);
> +		buffer = h->short_trespass ?
> +					       h->hr ?
> short_trespass_and_hr_pg
> +			      	      		     : short_trespass_pg
> +				           :
> +					       h->hr ?
> long_trespass_and_hr_pg
> +			      			     : long_trespass_pg;
> +		/* Can't DMA from kernel BSS -- must copy selected trespass
> +		 * command mode page contents to context buffer which is
> +		 * allocated from kmalloc memory. */
> +		BUG_ON((len > BUFFER_SIZE));
> +		memcpy(h->buffer, buffer, len);
> +		break;
> +	case INQUIRY:
> +		rq->cmd[1] = 0x1;
> +		rq->cmd[2] = 0xC0;
> +		len = BUFFER_SIZE;
> +		memset(h->buffer, 0, BUFFER_SIZE);
> +		break;
> +	default:
> +		BUG_ON(1);
> +		break;
> +	}
> +	rq->cmd[4] = len;
> +
> +	rq->buffer = rq->data = h->buffer;

You should use one of the block layer helpers. And yes I know they
allocate mem so fix the bio code so that it is also fixed for the path
testers :)

> +	rq->data_len = len;
> +	rq->bio = rq->biotail = NULL;
>  
> -	/* request is freed in block layer */
> -	free_bio(bio);
> +	rq->sense = h->sense;
> +	memset(rq->sense, 0, SCSI_SENSE_BUFFERSIZE);
> +	rq->sense_len = 0;
>  
> -	return 0;
> +	rq->flags |= (REQ_BLOCK_PC | REQ_FAILFAST);
> +	rq->timeout = EMC_HANDLER_TIMEOUT;
> +	rq->retries = 0;
> +
> +	return rq;
>  }
>  
> -static struct bio *get_failover_bio(struct path *path, unsigned data_size)
> +/*
> + * Initialize hwhandler context structure ... defaultSP and currentSP
> fields.
> + */
> +static int getSPInfo(struct emc_handler *h, struct path *path, int
> *defaultSP,
> +		     int *currentSP, int *newCurrentSP)
>  {

Here and throughout the rest of the patch, don't add mixed case and
follow the 80 col rule please.

> -	struct bio *bio;
> -	struct page *page;
> +	struct request_queue *q = bdev_get_queue(path->dev->bdev);
> +	struct request *rq;
> +	int err = 0;
>  
> -	bio = bio_alloc(GFP_ATOMIC, 1);
> -	if (!bio) {
> -		DMERR("get_failover_bio: bio_alloc() failed.");
> -		return NULL;
> +	/* get and initialize block request */
> +	rq = get_req(path, INQUIRY);
> +	if (!rq)
> +		return MP_FAIL_PATH;
> +
> +	/* issue the cmd (at head of q) & synchronously wait for completion
> */
> +	blk_execute_rq(q, NULL, rq, 1);
> +	if (rq->errors == 0) {
> +		/* check for in-progress ucode upgrade (NDU) */
> +		if (h->buffer[48] != 0) {
> +			DMWARN("getSPInfo: Detected in-progress ucode
> upgrade NDU operation while finding current active SP for mapped device %s
> using path %s.",
> +			       h->name, path->dev->name);
> +			err = MP_BYPASS_PG;
> +		}
> +		else {

the "else" here and other places should be on the same line as the "}".
See the coding style doc.

> +			*defaultSP = h->buffer[5];
> +
> +			if (h->buffer[4] == 2)
> +				/* SP for path (in h->buffer[8]) is current
> */
> +				*currentSP = h->buffer[8];
> +			else {
> +				if (h->buffer[4] == 1)
> +					/* SP for this path is NOT current
> */
> +					if (h->buffer[8] == 0)
> +						*currentSP = 1;
> +					else
> +						*currentSP = 0;
> +				else
> +					/* unbound LU or LUNZ */
> +					*currentSP = UNBOUND_LU;
> +			}
> +			*newCurrentSP =  h->buffer[8];
> +		}
>  	}
> +	else {
> +		struct scsi_sense_hdr sshdr;
>  
> -	bio->bi_rw |= (1 << BIO_RW);
> -	bio->bi_bdev = path->dev->bdev;
> -	bio->bi_sector = 0;
> -	bio->bi_private = path;
> -	bio->bi_end_io = emc_endio;
> +		err = MP_FAIL_PATH;
>  
> -	page = alloc_page(GFP_ATOMIC);
> -	if (!page) {
> -		DMERR("get_failover_bio: alloc_page() failed.");
> -		bio_put(bio);
> -		return NULL;
> +		if (rq->sense_len && scsi_normalize_sense(rq->sense,
> +						  	  rq->sense_len,
> &sshdr)) {
> +			DMERR("getSPInfo: Found valid sense data %02x, %2x,
> %2x while finding current active SP for mapped device %s using path %s.",
> +		       	       sshdr.sense_key, sshdr.asc, sshdr.ascq,
> +			       h->name, path->dev->name);
> +		}
> +		else DMERR("getSPInfo: Error 0x%x finding current active SP
> for mapped devicce %s using path %s.", rq->errors, h->name,
> path->dev->name);
>  	}
>  
> -	if (bio_add_page(bio, page, data_size, 0) != data_size) {
> -		DMERR("get_failover_bio: alloc_page() failed.");
> -		__free_page(page);
> -		bio_put(bio);
> -		return NULL;
> -	}
> +	blk_put_request(rq);
>  
> -	return bio;
> +	return err;
>  }
>  
> -static struct request *get_failover_req(struct emc_handler *h,
> -					struct bio *bio, struct path *path)
> +/* initialize path group command */
> +static int pgInit(struct emc_handler *h, struct path *path)
>  {
> +	struct request_queue *q = bdev_get_queue(path->dev->bdev);
> +	struct scsi_sense_hdr sshdr;
>  	struct request *rq;
> -	struct block_device *bdev = bio->bi_bdev;
> -	struct request_queue *q = bdev_get_queue(bdev);
> +	int err = 0;
>  
> -	/* FIXME: Figure out why it fails with GFP_ATOMIC. */
> -	rq = blk_get_request(q, WRITE, __GFP_WAIT);
> -	if (!rq) {
> -		DMERR("get_failover_req: blk_get_request failed");
> -		return NULL;
> +	/* get and initialize block request */
> +	rq = get_req(path, MODE_SELECT);
> +	if (!rq)
> +		return MP_FAIL_PATH;
> +
> +	DMINFO("pgInit: Sending switch-over command for mapped device %s
> using path %s.",
> +	       h->name, path->dev->name);
> +
> +	/* issue the cmd (at head of q) & synchronously wait for completion
> */
> +	blk_execute_rq(q, NULL, rq, 1); 

Why synchronously? For lot-o-devices we do not want to do do this do we?
We have functions and callbacks now to do this properly asynchronously.