diff -urN linux-2.6.12-00001/drivers/md/dm-raid1.c linux-2.6.12-00002/drivers/md/dm-raid1.c
--- linux-2.6.12-00001/drivers/md/dm-raid1.c	2005-07-13 11:00:43.000000000 -0500
+++ linux-2.6.12-00002/drivers/md/dm-raid1.c	2005-07-20 16:25:50.670999981 -0500
@@ -28,6 +28,8 @@
 	queue_work(_kmirrord_wq, &_kmirrord_work);
 }
 
+static struct workqueue_struct *_kmir_mon_wq;
+
 /*-----------------------------------------------------------------
  * Region hash
  *
@@ -539,7 +541,8 @@
  * Mirror set structures.
  *---------------------------------------------------------------*/
 struct mirror {
-	atomic_t error_count;
+	atomic_t error_count;		/* Error counter to flag mirror failure */
+	struct mirror_set *ms;
 	struct dm_dev *dev;
 	sector_t offset;
 };
@@ -550,16 +553,24 @@
 	struct region_hash rh;
 	struct kcopyd_client *kcopyd_client;
 
-	spinlock_t lock;	/* protects the next two lists */
+	spinlock_t lock;	/* protects the lists */
 	struct bio_list reads;
 	struct bio_list writes;
+	struct bio_list failures;
+	struct work_struct failure_work;
+	struct completion failure_completion;
 
 	/* recovery */
+	atomic_t suspended;
 	region_t nr_regions;
 	int in_sync;
 
 	unsigned int nr_mirrors;
-	struct mirror mirror[0];
+	spinlock_t choose_lock;		/* protects select in choose_mirror(). */
+	atomic_t read_count;		/* Read counter for read balancing. */
+	unsigned int read_mirror;	/* Last mirror read. */
+	struct mirror *default_mirror;	/* Default mirror. */
+	struct mirror mirror[0];
 };
 
 /*
@@ -607,7 +618,7 @@
 	unsigned long flags = 0;
 
 	/* fill in the source */
-	m = ms->mirror + DEFAULT_MIRROR;
+	m = ms->default_mirror;
 	from.bdev = m->dev->bdev;
 	from.sector = m->offset + region_to_sector(reg->rh, reg->key);
 	if (reg->key == (ms->nr_regions - 1)) {
@@ -623,7 +634,7 @@
 
 	/* fill in the destinations */
 	for (i = 0, dest = to; i < ms->nr_mirrors; i++) {
-		if (i == DEFAULT_MIRROR)
+		if (&ms->mirror[i] == ms->default_mirror)
 			continue;
 
 		m = ms->mirror + i;
@@ -673,12 +684,72 @@
 }
 
 /*-----------------------------------------------------------------
- * Reads
+ * Misc Functions
  *---------------------------------------------------------------*/
-static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector)
+#define MIN_READS	128
+/*
+ * choose_mirror
+ * @ms: the mirror set
+ * @m: mirror that has failed, or NULL if just choosing
+ *
+ * Returns: chosen mirror, or NULL on failure
+ */
+static struct mirror *choose_mirror(struct mirror_set *ms, struct mirror *m)
+{
+	int i, retry;
+	unsigned long flags;
+	struct mirror *ret = NULL;
+
+	spin_lock_irqsave(&ms->choose_lock, flags);
+
+	if (unlikely(m == ms->default_mirror)) {
+		i = DEFAULT_MIRROR;
+		atomic_set(&ms->read_count, MIN_READS);
+	} else
+		i = ms->read_mirror;
+
+	for (retry = 0; retry < ms->nr_mirrors; ) {
+		i %= ms->nr_mirrors;
+		ret = ms->mirror + i;
+
+		if (unlikely(atomic_read(&ret->error_count))) {
+			retry++;
+			i++;
+		} else {
+			/*
+			 * Guarantee that a number of read IOs
+			 * get queued to the same mirror.
+			 */
+			if (atomic_dec_and_test(&ms->read_count)) {
+				atomic_set(&ms->read_count, MIN_READS);
+				i++;
+			}
+
+			ms->read_mirror = i;
+			break;
+		}
+	}
+
+	/* Check for failure of default mirror, reset if necessary */
+	if (unlikely(m == ms->default_mirror))
+		ms->default_mirror = ret;
+
+	spin_unlock_irqrestore(&ms->choose_lock, flags);
+
+	if (unlikely(atomic_read(&ret->error_count))) {
+		DMERR("All mirror devices are dead. Unable to choose mirror.");
+		return NULL;
+	}
+
+	return ret;
+}
+
+static void fail_mirror(struct mirror *m)
 {
-	/* FIXME: add read balancing */
-	return ms->mirror + DEFAULT_MIRROR;
+	DMINFO("incrementing error_count on %s", m->dev->name);
+	atomic_inc(&m->error_count);
+
+	choose_mirror(m->ms, m);
 }
 
 /*
@@ -690,6 +761,9 @@
 	bio->bi_sector = m->offset + (bio->bi_sector - ms->ti->begin);
 }
 
+/*-----------------------------------------------------------------
+ * Reads
+ *---------------------------------------------------------------*/
 static void do_reads(struct mirror_set *ms, struct bio_list *reads)
 {
 	region_t region;
@@ -703,9 +777,9 @@
 		 * We can only read balance if the region is in sync.
 		 */
 		if (rh_in_sync(&ms->rh, region, 0))
-			m = choose_mirror(ms, bio->bi_sector);
+			m = choose_mirror(ms, NULL);
 		else
-			m = ms->mirror + DEFAULT_MIRROR;
+			m = ms->default_mirror;
 
 		map_bio(ms, m, bio);
 		generic_make_request(bio);
@@ -722,36 +796,90 @@
  *   RECOVERING:	delay the io until recovery completes
  *   NOSYNC:	increment pending, just write to the default mirror
  *---------------------------------------------------------------*/
+static void write_failure_handler(void *data)
+{
+	struct bio *bio;
+	struct bio_list failed_writes;
+	struct mirror_set *ms = (struct mirror_set *)data;
+
+	dm_table_event(ms->ti->table);
+	wait_for_completion(&ms->failure_completion);
+
+	/* Take list out to handle endios. */
+	spin_lock_irq(&ms->lock);
+	failed_writes = ms->failures;
+	bio_list_init(&ms->failures);
+	spin_unlock_irq(&ms->lock);
+
+	while ((bio = bio_list_pop(&failed_writes)))
+		bio_endio(bio, bio->bi_size, 0);
+}
+
 static void write_callback(unsigned long error, void *context)
 {
-	unsigned int i;
-	int uptodate = 1;
+	unsigned int i, ret = 0;
 	struct bio *bio = (struct bio *) context;
 	struct mirror_set *ms;
-
+	int uptodate = 0, run;
+
 	ms = bio_get_ms(bio);
 	bio_set_ms(bio, NULL);
-
+
 	/*
 	 * NOTE: We don't decrement the pending count here,
 	 * instead it is done by the targets endio function.
 	 * This way we handle both writes to SYNC and NOSYNC
 	 * regions with the same code.
 	 */
+	if (unlikely(error)) {
+		DMERR("Error during write occurred.");
 
-	if (error) {
 		/*
-		 * only error the io if all mirrors failed.
-		 * FIXME: bogus
+		 * Test all bits - if all failed, fail io.
+		 * Otherwise, go through hassle of failing a device...
 		 */
-		uptodate = 0;
-		for (i = 0; i < ms->nr_mirrors; i++)
-			if (!test_bit(i, &error)) {
+		for (i = 0; i < ms->nr_mirrors; i++) {
+			if (test_bit(i, &error))
+				fail_mirror(ms->mirror + i);
+			else
 				uptodate = 1;
-				break;
+		}
+
+		if (likely(uptodate)) {
+			spin_lock(&ms->lock);
+			if (atomic_read(&ms->suspended)) {
+				/*
+				 * The device is suspended, it is
+				 * safe to complete I/O.
+				 */
+				spin_unlock(&ms->lock);
+			} else {
+				/*
+				 * Need to raise event. Since raising
+				 * events can block, we need to do it in
+				 * a separate thread.
+				 *
+				 * run gets set if this will be the first
+				 * bio in the list.
+				 */
+				run = !ms->failures.head;
+				bio_list_add(&ms->failures, bio);
+				spin_unlock(&ms->lock);
+
+				if (run)
+					queue_work(_kmir_mon_wq,
						   &ms->failure_work);
+
+				return;
 			}
+		} else {
+			DMERR("All replicated volumes dead, failing I/O");
+			/* None of the writes succeeded, fail the I/O. */
+			ret = -EIO;
+		}
 	}
-	bio_endio(bio, bio->bi_size, 0);
+
+	bio_endio(bio, bio->bi_size, ret);
 }
 
 static void do_write(struct mirror_set *ms, struct bio *bio)
@@ -829,7 +957,7 @@
 		rh_delay(&ms->rh, bio);
 
 	while ((bio = bio_list_pop(&nosync))) {
-		map_bio(ms, ms->mirror + DEFAULT_MIRROR, bio);
+		map_bio(ms, ms->default_mirror, bio);
 		generic_make_request(bio);
 	}
 }
@@ -891,11 +1019,16 @@
 	memset(ms, 0, len);
 	spin_lock_init(&ms->lock);
+	spin_lock_init(&ms->choose_lock);
 	ms->ti = ti;
 	ms->nr_mirrors = nr_mirrors;
 	ms->nr_regions = dm_sector_div_up(ti->len, region_size);
 	ms->in_sync = 0;
+	ms->default_mirror = &ms->mirror[DEFAULT_MIRROR];
+
+	/* a resume must be issued to start the device */
+	atomic_set(&ms->suspended, 1);
 
 	if (rh_init(&ms->rh, ms, dl, region_size, ms->nr_regions)) {
 		ti->error = "dm-mirror: Error creating dirty region hash";
@@ -903,6 +1036,13 @@
 		return NULL;
 	}
 
+	atomic_set(&ms->read_count, MIN_READS);
+
+	bio_list_init(&ms->failures);
+	INIT_WORK(&ms->failure_work, write_failure_handler, ms);
+
+	init_completion(&ms->failure_completion);
+
 	return ms;
 }
@@ -940,6 +1080,8 @@
 	}
 
 	ms->mirror[mirror].offset = offset;
+	atomic_set(&(ms->mirror[mirror].error_count), 0);
+	ms->mirror[mirror].ms = ms;
 
 	return 0;
 }
@@ -1134,7 +1276,7 @@
 		return 0;
 	}
 
-	m = choose_mirror(ms, bio->bi_sector);
+	m = choose_mirror(ms, NULL);
 	if (!m)
 		return -EIO;
 
@@ -1162,10 +1304,24 @@
 {
 	struct mirror_set *ms = (struct mirror_set *) ti->private;
 	struct dirty_log *log = ms->rh.log;
+	unsigned long flags;
+	int run;
+
+	/*
+	 * Only run the completion if we are suspending after
+	 * a disk failure.
+	 */
+	spin_lock_irqsave(&ms->lock, flags);
+	run = ms->failures.head ? 1 : 0;
+	spin_unlock_irqrestore(&ms->lock, flags);
+
+	if (run)
+		complete(&ms->failure_completion);
 
 	if (log->type->presuspend && log->type->presuspend(log))
 		/* FIXME: need better error handling */
 		DMWARN("log presuspend failed");
+
 }
 
 static void mirror_postsuspend(struct dm_target *ti)
@@ -1177,38 +1333,48 @@
 	if (log->type->postsuspend && log->type->postsuspend(log))
 		/* FIXME: need better error handling */
 		DMWARN("log postsuspend failed");
+	atomic_set(&ms->suspended, 1);
 }
 
 static void mirror_resume(struct dm_target *ti)
 {
	struct mirror_set *ms = (struct mirror_set *) ti->private;
 	struct dirty_log *log = ms->rh.log;
+
 	if (log->type->resume && log->type->resume(log))
 		/* FIXME: need better error handling */
 		DMWARN("log resume failed");
-	rh_start_recovery(&ms->rh);
+
+	if (atomic_dec_and_test(&ms->suspended))
+		rh_start_recovery(&ms->rh);
+	atomic_set(&ms->suspended, 0);
 }
 
 static int mirror_status(struct dm_target *ti, status_type_t type,
 			 char *result, unsigned int maxlen)
 {
-	unsigned int m, sz;
+	unsigned int m, sz = 0;
 	struct mirror_set *ms = (struct mirror_set *) ti->private;
-
-	sz = ms->rh.log->type->status(ms->rh.log, type, result, maxlen);
+	char buffer[ms->nr_mirrors + 1];
 
 	switch (type) {
 	case STATUSTYPE_INFO:
 		DMEMIT("%d ", ms->nr_mirrors);
-		for (m = 0; m < ms->nr_mirrors; m++)
+		for (m = 0; m < ms->nr_mirrors; m++) {
 			DMEMIT("%s ", ms->mirror[m].dev->name);
+			buffer[m] = atomic_read(&(ms->mirror[m].error_count)) ?
+				'D' : 'A';
+		}
+		buffer[m] = '\0';
 
-		DMEMIT(SECTOR_FORMAT "/" SECTOR_FORMAT,
+		DMEMIT(SECTOR_FORMAT "/" SECTOR_FORMAT " 1 %s ",
 		       ms->rh.log->type->get_sync_count(ms->rh.log),
-		       ms->nr_regions);
+		       ms->nr_regions, buffer);
+		ms->rh.log->type->status(ms->rh.log, type, result+sz, maxlen-sz);
 		break;
 
 	case STATUSTYPE_TABLE:
+		sz = ms->rh.log->type->status(ms->rh.log, type, result, maxlen);
 		DMEMIT("%d ", ms->nr_mirrors);
 		for (m = 0; m < ms->nr_mirrors; m++)
 			DMEMIT("%s " SECTOR_FORMAT " ",
@@ -1226,6 +1392,7 @@
 	.dtr = mirror_dtr,
 	.map = mirror_map,
 	.end_io = mirror_end_io,
+	.presuspend = mirror_presuspend,
 	.postsuspend = mirror_postsuspend,
 	.resume = mirror_resume,
 	.status = mirror_status,
@@ -1243,16 +1410,25 @@
 	if (!_kmirrord_wq) {
 		DMERR("couldn't start kmirrord");
 		dm_dirty_log_exit();
-		return r;
+		return -ENOMEM;
 	}
 	INIT_WORK(&_kmirrord_work, do_work, NULL);
 
+	_kmir_mon_wq = create_workqueue("kmir_mon");
+	if (!_kmir_mon_wq) {
+		DMERR("couldn't start kmir_mon");
+		dm_dirty_log_exit();
+		destroy_workqueue(_kmirrord_wq);
+		return -ENOMEM;
+	}
+
 	r = dm_register_target(&mirror_target);
 	if (r < 0) {
		DMERR("%s: Failed to register mirror target",
 		      mirror_target.name);
 		dm_dirty_log_exit();
 		destroy_workqueue(_kmirrord_wq);
+		destroy_workqueue(_kmir_mon_wq);
 	}
 
 	return r;
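
The hunks above replace the old FIXME'd "always read from DEFAULT_MIRROR" policy with a small round-robin read balancer: choose_mirror() keeps MIN_READS (128) consecutive reads on the current mirror before advancing, skips any mirror whose error_count is non-zero, and returns NULL only when every mirror has failed. The standalone userspace sketch below models just that selection loop so the policy can be compiled and observed outside the kernel; the toy_* names, the fixed NR_MIRRORS, and the plain ints standing in for atomic_t, the choose_lock spinlock, and struct mirror_set are illustrative assumptions, not code from the patch.

#include <stdio.h>

#define MIN_READS	128
#define NR_MIRRORS	3

struct toy_mirror {
	int error_count;		/* non-zero means the mirror has failed */
};

static struct toy_mirror mirror[NR_MIRRORS];
static unsigned int read_mirror;	/* index the next search starts from */
static int read_count = MIN_READS;	/* reads left in the current batch */

/* Model of choose_mirror(): returns a mirror index, or -1 if all failed. */
static int toy_choose_mirror(void)
{
	unsigned int i = read_mirror;
	int retry, ret = -1;

	for (retry = 0; retry < NR_MIRRORS; ) {
		i %= NR_MIRRORS;
		ret = i;

		if (mirror[i].error_count) {
			retry++;	/* skip a dead mirror, try the next */
			i++;
		} else {
			/* Keep a batch of MIN_READS IOs on one mirror. */
			if (--read_count == 0) {
				read_count = MIN_READS;
				i++;
			}
			read_mirror = i;
			break;
		}
	}

	return (ret >= 0 && !mirror[ret].error_count) ? ret : -1;
}

int main(void)
{
	int n, m, hist[NR_MIRRORS] = { 0 };

	mirror[1].error_count = 1;	/* pretend mirror 1 has failed */

	for (n = 0; n < 1024; n++) {
		m = toy_choose_mirror();
		if (m < 0)
			return 1;	/* all mirrors dead */
		hist[m]++;
	}

	for (m = 0; m < NR_MIRRORS; m++)
		printf("mirror %d served %d reads\n", m, hist[m]);

	return 0;
}

With mirror 1 marked failed, this prints 512 reads each for mirrors 0 and 2 and none for mirror 1: reads rotate between the surviving mirrors in batches of MIN_READS. If every mirror is marked failed, the retry counter exhausts the set and the function returns -1, matching the NULL return and "All mirror devices are dead" path in the patch. The same patch also extends STATUSTYPE_INFO with a per-mirror health string derived from error_count ('A' alive, 'D' dead), so a healthy two-mirror set would report something like "2 254:3 254:4 1024/1024 1 AA" followed by the dirty-log status; the device numbers in that example are hypothetical.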