diff -urN linux-2.6.12-00002/drivers/md/dm-raid1.c linux-2.6.12-00003/drivers/md/dm-raid1.c
--- linux-2.6.12-00002/drivers/md/dm-raid1.c	2005-07-20 16:25:50.670999981 -0500
+++ linux-2.6.12-00003/drivers/md/dm-raid1.c	2005-07-20 16:32:08.092180248 -0500
@@ -6,6 +6,7 @@
 
 #include "dm.h"
 #include "dm-bio-list.h"
+#include "dm-bio-record.h"
 #include "dm-io.h"
 #include "dm-log.h"
 #include "kcopyd.h"
@@ -573,24 +574,39 @@
 	struct mirror mirror[0];
 };
 
+struct bio_map_info {
+	struct mirror *bmi_m;
+	struct dm_bio_details bmi_bd;
+};
+
+static mempool_t *bio_map_info_pool = NULL;
+
+static void *bio_map_info_alloc(unsigned int gfp_mask, void *pool_data){
+	return kmalloc(sizeof(struct bio_map_info), gfp_mask);
+}
+
+static void bio_map_info_free(void *element, void *pool_data){
+	kfree(element);
+}
+
 /*
  * Every mirror should look like this one.
  */
 #define DEFAULT_MIRROR 0
 
 /*
- * This is yucky.  We squirrel the mirror_set struct away inside
- * bi_next for write buffers.  This is safe since the bh
+ * This is yucky.  We squirrel the mirror struct away inside
+ * bi_next for read/write buffers.  This is safe since the bh
  * doesn't get submitted to the lower levels of block layer.
  */
-static struct mirror_set *bio_get_ms(struct bio *bio)
+static struct mirror *bio_get_m(struct bio *bio)
 {
-	return (struct mirror_set *) bio->bi_next;
+	return (struct mirror *) bio->bi_next;
 }
 
-static void bio_set_ms(struct bio *bio, struct mirror_set *ms)
+static void bio_set_m(struct bio *bio, struct mirror *m)
 {
-	bio->bi_next = (struct bio *) ms;
+	bio->bi_next = (struct bio *) m;
 }
 
 /*-----------------------------------------------------------------
@@ -752,37 +768,95 @@
 	choose_mirror(m->ms, m);
 }
 
+static int default_ok(struct mirror *m)
+{
+	return !atomic_read(&m->ms->default_mirror->error_count);
+}
+
 /*
  * remap a buffer to a particular mirror.
  */
-static void map_bio(struct mirror_set *ms, struct mirror *m, struct bio *bio)
+static sector_t map_sector(struct mirror *m, struct bio *bio)
+{
+	return m->offset + (bio->bi_sector - m->ms->ti->begin);
+}
+
+static void map_bio(struct mirror *m, struct bio *bio)
 {
 	bio->bi_bdev = m->dev->bdev;
-	bio->bi_sector = m->offset + (bio->bi_sector - ms->ti->begin);
+	bio->bi_sector = map_sector(m, bio);
+}
+
+static void map_region(struct io_region *io, struct mirror *m,
+		       struct bio *bio)
+{
+	io->bdev = m->dev->bdev;
+	io->sector = map_sector(m, bio);
+	io->count = bio->bi_size >> 9;
 }
 
 /*-----------------------------------------------------------------
  * Reads
 *---------------------------------------------------------------*/
+static void read_callback(unsigned long error, void *context)
+{
+	struct bio *bio = (struct bio *)context;
+	struct mirror *m;
+
+	m = bio_get_m(bio);
+	bio_set_m(bio, NULL);
+
+	if (unlikely(error)) {
+		DMWARN("A read failure occurred on a mirror device.");
+		fail_mirror(m);
+		if (likely(default_ok(m))) {
+			DMWARN("Trying different device.");
+			queue_bio(m->ms, bio, bio_rw(bio));
+		} else {
+			DMERR("No other device available, failing I/O.");
+			bio_endio(bio, bio->bi_size, -EIO);
+		}
+	} else
+		bio_endio(bio, bio->bi_size, 0);
+}
+
+/* Asynchronous read. */
+static void read_async_bio(struct mirror *m, struct bio *bio)
+{
+	struct io_region io;
+
+	map_region(&io, m, bio);
+	bio_set_m(bio, m);
+	dm_io_async_bvec(1, &io, READ,
+			 bio->bi_io_vec + bio->bi_idx,
+			 read_callback, bio);
+}
+
 static void do_reads(struct mirror_set *ms, struct bio_list *reads)
 {
-	region_t region;
 	struct bio *bio;
 	struct mirror *m;
 
 	while ((bio = bio_list_pop(reads))) {
-		region = bio_to_region(&ms->rh, bio);
-
 		/*
 		 * We can only read balance if the region is in sync.
 		 */
-		if (rh_in_sync(&ms->rh, region, 0))
+		if (likely(rh_in_sync(&ms->rh,
+				      bio_to_region(&ms->rh, bio),
+				      0) == RH_CLEAN))
 			m = choose_mirror(ms, NULL);
-		else
+		else {
 			m = ms->default_mirror;
 
-		map_bio(ms, m, bio);
-		generic_make_request(bio);
+			/* If the default fails, we give up. */
+			if (unlikely(m && atomic_read(&m->error_count)))
+				m = NULL;
+		}
+
+		if (likely(m))
+			read_async_bio(m, bio);
+		else
+			bio_endio(bio, bio->bi_size, -EIO);
 	}
 }
 
@@ -822,8 +896,8 @@
 	struct mirror_set *ms;
 	int uptodate = 0, run;
 
-	ms = bio_get_ms(bio);
-	bio_set_ms(bio, NULL);
+	ms = (bio_get_m(bio))->ms;
+	bio_set_m(bio, NULL);
 
 	/*
 	 * NOTE: We don't decrement the pending count here,
@@ -885,21 +959,24 @@
 static void do_write(struct mirror_set *ms, struct bio *bio)
 {
 	unsigned int i;
-	struct io_region io[KCOPYD_MAX_REGIONS+1];
+	struct io_region io[ms->nr_mirrors], *dest = io;
 	struct mirror *m;
 
-	for (i = 0; i < ms->nr_mirrors; i++) {
-		m = ms->mirror + i;
+	for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++)
+		map_region(dest++, m, bio);
 
-		io[i].bdev = m->dev->bdev;
-		io[i].sector = m->offset + (bio->bi_sector - ms->ti->begin);
-		io[i].count = bio->bi_size >> 9;
-	}
-
-	bio_set_ms(bio, ms);
-	dm_io_async_bvec(ms->nr_mirrors, io, WRITE,
-			 bio->bi_io_vec + bio->bi_idx,
-			 write_callback, bio);
+	if (likely(dest - io)) {
+		/*
+		 * We can use the default mirror here, because we
+		 * only need it in order to retrieve the reference
+		 * to the mirror set in write_callback().
+		 */
+		bio_set_m(bio, ms->default_mirror);
+		dm_io_async_bvec(dest - io, io, WRITE,
+				 bio->bi_io_vec + bio->bi_idx,
+				 write_callback, bio);
+	} else
+		bio_endio(bio, bio->bi_size, -EIO);
 }
 
 static void do_writes(struct mirror_set *ms, struct bio_list *writes)
@@ -957,7 +1034,7 @@
 		rh_delay(&ms->rh, bio);
 
 	while ((bio = bio_list_pop(&nosync))) {
-		map_bio(ms, ms->default_mirror, bio);
+		map_bio(ms->default_mirror, bio);
 		generic_make_request(bio);
 	}
 }
@@ -1246,42 +1323,64 @@
 	int r, rw = bio_rw(bio);
 	struct mirror *m;
 	struct mirror_set *ms = ti->private;
-
-	map_context->ll = bio->bi_sector >> ms->rh.region_shift;
+	struct dm_bio_details *bd;
+	struct bio_map_info *bmi;
 
 	if (rw == WRITE) {
+		/* Save region for mirror_end_io() handler */
+		map_context->ll = bio_to_region(&ms->rh, bio);
 		queue_bio(ms, bio, rw);
 		return 0;
 	}
 
+	/* It's all about the READs now */
+
 	r = ms->rh.log->type->in_sync(ms->rh.log,
 				      bio_to_region(&ms->rh, bio), 0);
 	if (r < 0 && r != -EWOULDBLOCK)
 		return r;
 
-	if (r == -EWOULDBLOCK)	/* FIXME: ugly */
+	if (r == -EWOULDBLOCK)
 		r = 0;
 
-	/*
-	 * We don't want to fast track a recovery just for a read
-	 * ahead.  So we just let it silently fail.
-	 * FIXME: get rid of this.
-	 */
-	if (!r && rw == READA)
-		return -EIO;
+	if (likely(r)) {
+		/*
+		 * Optimize reads by not handing them to the daemon.
+		 *
+		 * In case they fail, queue them for another shot
+		 * in the mirror_end_io() function.
+		 */
+		m = choose_mirror(ms, NULL);
+		if (likely(m)) {
+			bmi = mempool_alloc(bio_map_info_pool, GFP_KERNEL);
+
+			if (likely(bmi)) {
+				/* without this, a read is not retryable */
+				bd = &bmi->bmi_bd;
+				dm_bio_record(bd, bio);
+				map_context->ptr = bmi;
+				bmi->bmi_m = m;
+			} else {
+				/* We could fail now, but we can at least
+				 * give it a shot.  The bd is only used to
+				 * retry in the event of a failure anyway.
+				 * If we fail, we can fail the I/O then. */
+				map_context->ptr = NULL;
+			}
+
+			map_bio(m, bio);
+			return 1; /* Mapped -> queue request. */
+		} else
+			return -EIO;
+	} else {
+		/* Either not clean, or -EWOULDBLOCK */
+		if (rw == READA)
+			return -EWOULDBLOCK;
 
-	if (!r) {
-		/* Pass this io over to the daemon */
 		queue_bio(ms, bio, rw);
-		return 0;
 	}
 
-	m = choose_mirror(ms, NULL);
-	if (!m)
-		return -EIO;
-
-	map_bio(ms, m, bio);
-	return 1;
+	return 0;
 }
 
 static int mirror_end_io(struct dm_target *ti, struct bio *bio,
@@ -1289,15 +1388,53 @@
 {
 	int rw = bio_rw(bio);
 	struct mirror_set *ms = (struct mirror_set *) ti->private;
-	region_t region = map_context->ll;
+	struct mirror *m = NULL;
 
 	/*
 	 * We need to dec pending if this was a write.
 	 */
-	if (rw == WRITE)
-		rh_dec(&ms->rh, region);
+	if (rw == WRITE) {
+		rh_dec(&ms->rh, map_context->ll);
+		return error;
+	}
 
-	return 0;
+	if (unlikely(error)) {
+		struct dm_bio_details *bd = NULL;
+
+		DMERR("A read failure occurred on a mirror device.");
+		if (!map_context->ptr) {
+			/*
+			 * There wasn't enough memory to record necessary
+			 * information for a retry.
+			 */
+			DMERR("Out of memory causing inability to retry read.");
+			return -EIO;
+		}
+		m = ((struct bio_map_info *)map_context->ptr)->bmi_m;
+		fail_mirror(m); /* Flag error on mirror. */
+
+		/*
+		 * A failed read needs to get queued
+		 * to the daemon for another shot at
+		 * one of the intact mirrors (if any).
+		 */
+		if (rw == READ && default_ok(m)) {
+			bd = &(((struct bio_map_info *)map_context->ptr)->bmi_bd);
+
+			DMWARN("Trying different device.");
+			dm_bio_restore(bd, bio);
+			mempool_free(map_context->ptr, bio_map_info_pool);
+			map_context->ptr = NULL;
+			queue_bio(ms, bio, rw);
+			return 1; /* We want another shot on the bio. */
+		}
+		DMERR("All replicated volumes dead, failing I/O.");
+	}
+
+	if (map_context->ptr)
+		mempool_free(map_context->ptr, bio_map_info_pool);
+
+	return error;
 }
 
 static void mirror_presuspend(struct dm_target *ti)
@@ -1402,6 +1539,11 @@
 {
 	int r;
 
+	bio_map_info_pool = mempool_create(100, bio_map_info_alloc,
+					   bio_map_info_free, NULL);
+	if (!bio_map_info_pool)
+		return -ENOMEM;
+
 	r = dm_dirty_log_init();
 	if (r)
 		return r;
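
A note on why the bio must be recorded and restored: mirror_map() remaps
bi_bdev and bi_sector to the chosen mirror, and the block layer adjusts
bi_idx and bi_size as parts of the request complete, so a failed read can
only be re-queued to another mirror after it has been rewound to its
pre-map state.  That is the job of the dm-bio-record.h helpers used above.
The sketch below shows roughly what such a header provides; the exact
field list is an assumption based on the 2.6-era struct bio, not a quote
of the header introduced earlier in this series.

/*
 * Sketch of a dm-bio-record.h style interface (assumed field list).
 */
#ifndef DM_BIO_RECORD_H
#define DM_BIO_RECORD_H

#include <linux/bio.h>

struct dm_bio_details {
	sector_t bi_sector;		/* remapped by map_bio() */
	struct block_device *bi_bdev;	/* remapped by map_bio() */
	unsigned int bi_size;		/* consumed as the bio completes */
	unsigned short bi_idx;		/* advances as bvecs complete */
	unsigned long bi_flags;
};

/* Snapshot the fields that mapping and completion may modify. */
static inline void dm_bio_record(struct dm_bio_details *bd, struct bio *bio)
{
	bd->bi_sector = bio->bi_sector;
	bd->bi_bdev = bio->bi_bdev;
	bd->bi_size = bio->bi_size;
	bd->bi_idx = bio->bi_idx;
	bd->bi_flags = bio->bi_flags;
}

/* Rewind the bio so it can be mapped to a different mirror. */
static inline void dm_bio_restore(struct dm_bio_details *bd, struct bio *bio)
{
	bio->bi_sector = bd->bi_sector;
	bio->bi_bdev = bd->bi_bdev;
	bio->bi_size = bd->bi_size;
	bio->bi_idx = bd->bi_idx;
	bio->bi_flags = bd->bi_flags;
}

#endif

Note also that the bio_map_info records come from a private mempool
created in dm_mirror_init(), so a record is normally available even under
memory pressure; mirror_map() still tolerates a NULL return by mapping the
read without retry information, which mirror_end_io() detects through
map_context->ptr.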