[dm-devel] failover does not work with rdac device handler

Moger, Babu Babu.Moger at lsi.com
Tue Oct 7 18:34:01 UTC 2008


Hi Chandra,

I have both good news and bad news with this patch.

The good news is that failover is happening (I no longer see the earlier message "Cannot failover device because scsi_dh_rdac was not loaded").

The bad news is that I am seeing the soft panic below.

  Oct  7 12:50:15 localhost kernel: BUG: unable to handle kernel NULL pointer dereference at 0000000000000238
Oct  7 12:50:15 localhost kernel: IP: [<ffffffffa038e283>] rdac_bus_detach+0xd/0x9a [scsi_dh_rdac]
Oct  7 12:50:15 localhost kernel: PGD 0
Oct  7 12:50:15 localhost kernel: Oops: 0000 [1] SMP
Oct  7 12:50:15 localhost kernel: CPU 3
Oct  7 12:50:15 localhost kernel: Modules linked in: ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp autofs4 i2c_dev i2c_core hidp rfcomm l2cap bluetooth sunrpc dm_round_robin scsi_dh_rdac dm_multipath scsi_dh sbs sbshc battery acpi_memhotplug ac ipv6 parport_pc lp parport joydev sg bnx2 ide_cd_mod cdrom button rtc_cmos dcdbas serio_raw rtc_core i5000_edac shpchp edac_core rtc_lib pcspkr dm_snapshot dm_zero dm_mirror dm_log dm_mod lpfc qla2xxx scsi_transport_fc ata_piix libata megaraid_sas sd_mod scsi_mod ext3 jbd uhci_hcd ohci_hcd ehci_hcd [last unloaded: microcode]
Oct  7 12:50:15 localhost kernel: Pid: 683, comm: fc_wq_3 Not tainted 2.6.27-rc7-babu #2
Oct  7 12:50:15 localhost kernel: RIP: 0010:[<ffffffffa038e283>]  [<ffffffffa038e283>] rdac_bus_detach+0xd/0x9a [scsi_dh_rdac]
Oct  7 12:50:15 localhost kernel: RSP: 0018:ffff88007d4b3cd0  EFLAGS: 00010282
Oct  7 12:50:15 localhost kernel: RAX: 0000000000000000 RBX: ffff88007d4b3730 RCX: ffffffff00007530
Oct  7 12:50:15 localhost kernel: RDX: 0000000000000000 RSI: 0000000000000040 RDI: ffff88007d4b3730
Oct  7 12:50:15 localhost kernel: RBP: ffffffffa0376000 R08: ffff88007d54ef50 R09: ffff88007fb79090
Oct  7 12:50:15 localhost kernel: R10: 0000000000000000 R11: ffffffff802fa1f5 R12: 0000000000000002
Oct  7 12:50:15 localhost kernel: R13: ffff88007db97920 R14: 0000000000000002 R15: ffff88007db97920
Oct  7 12:50:15 localhost kernel: FS:  0000000000000000(0000) GS:ffff88007f005640(0000) knlGS:0000000000000000
Oct  7 12:50:15 localhost kernel: CS:  0010 DS: 0018 ES: 0018 CR0: 000000008005003b
Oct  7 12:50:15 localhost kernel: CR2: 0000000000000238 CR3: 0000000000201000 CR4: 00000000000006e0
Oct  7 12:50:15 localhost kernel: DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
Oct  7 12:50:15 localhost kernel: DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Oct  7 12:50:15 localhost kernel: Process fc_wq_3 (pid: 683, threadinfo ffff88007d4b2000, task ffff88007f1f88d0)
Oct  7 12:50:15 localhost kernel: Stack:  ffff88007db97920 ffff88007b0e1c08 ffffffffa0376000 ffffffffa0376029
Oct  7 12:50:15 localhost kernel:  ffff88007d54ef50 ffff88007c45f4b0 ffff88007b0e1c00 ffffffff8031e383
Oct  7 12:50:15 localhost kernel:  ffff88007db97800 ffff88007db97800 ffff88007db97920 ffffffffa0376443
Oct  7 12:50:15 localhost kernel: Call Trace:
Oct  7 12:50:15 localhost kernel:  [<ffffffffa0376000>] ? scsi_dh_release+0x0/0x2e [scsi_dh]
Oct  7 12:50:15 localhost kernel:  [<ffffffffa0376029>] ? scsi_dh_release+0x29/0x2e [scsi_dh]
Oct  7 12:50:15 localhost kernel:  [<ffffffff8031e383>] ? kref_put+0x41/0x4c
Oct  7 12:50:15 localhost kernel:  [<ffffffffa0376443>] ? scsi_dh_notifier+0x73/0x7a [scsi_dh]
Oct  7 12:50:15 localhost kernel:  [<ffffffff80483885>] ? notifier_call_chain+0x29/0x4c
Oct  7 12:50:15 localhost kernel:  [<ffffffff80249a4c>] ? __blocking_notifier_call_chain+0x48/0x62
Oct  7 12:50:15 localhost kernel:  [<ffffffff80396099>] ? device_del+0x150/0x178
Oct  7 12:50:15 localhost kernel:  [<ffffffffa0074f2e>] ? __scsi_remove_device+0x3a/0x7a [scsi_mod]
Oct  7 12:50:15 localhost kernel:  [<ffffffffa0074f8f>] ? scsi_remove_device+0x21/0x2e [scsi_mod]
Oct  7 12:50:15 localhost kernel:  [<ffffffffa0075021>] ? __scsi_remove_target+0x85/0xc3 [scsi_mod]
Oct  7 12:50:15 localhost kernel:  [<ffffffffa00750a4>] ? __remove_child+0x0/0x1a [scsi_mod]
Oct  7 12:50:16 localhost kernel:  [<ffffffffa00750ba>] ? __remove_child+0x16/0x1a [scsi_mod]
Oct  7 12:50:16 localhost kernel:  [<ffffffff80395955>] ? device_for_each_child+0x22/0x4d
Oct  7 12:50:16 localhost kernel:  [<ffffffffa0075099>] ? scsi_remove_target+0x3a/0x45 [scsi_mod]
Oct  7 12:50:16 localhost kernel:  [<ffffffffa00fd7b9>] ? fc_starget_delete+0x0/0x64 [scsi_transport_fc]
Oct  7 12:50:16 localhost kernel:  [<ffffffff802439e4>] ? run_workqueue+0x7a/0x102
Oct  7 12:50:16 localhost kernel:  [<ffffffff802442dc>] ? worker_thread+0xd5/0xe0
Oct  7 12:50:16 localhost kernel:  [<ffffffff80246a8e>] ? autoremove_wake_function+0x0/0x2e
Oct  7 12:50:16 localhost kernel:  [<ffffffff80244207>] ? worker_thread+0x0/0xe0
Oct  7 12:50:16 localhost multipathd: mpath2: load table [0 20971520 multipath 0 1 rdac 2 1 round-robin 0 2 1 8:80 300 8:176 300 round-robin 0 1 1 8:224 100]
Oct  7 12:50:16 localhost kernel:  [<ffffffff80246960>] ? kthread+0x47/0x73
Oct  7 12:50:16 localhost kernel:  [<ffffffff80230fc3>] ? schedule_tail+0x28/0x60
Oct  7 12:50:16 localhost kernel:  [<ffffffff8020cd49>] ? child_rip+0xa/0x11
Oct  7 12:50:16 localhost kernel:  [<ffffffff80246919>] ? kthread+0x0/0x73
Oct  7 12:50:16 localhost kernel:  [<ffffffff8020cd3f>] ? child_rip+0x0/0x11
Oct  7 12:50:16 localhost kernel:
Oct  7 12:50:16 localhost kernel:
Oct  7 12:50:16 localhost kernel: Code: a0 31 c0 e8 8e 80 ea df c6 43 0d 02 eb 08 a8 01 74 04 c6 43 0d 01 5b 5d 44 89 e0 41 5c c3 55 53 48 89 fb 48 83 ec 08 48 8b 47 08 <48> 8b b8 38 02 00 00 e8 e0 30 0f e0 48 89 c6 48 8b 43 08 48 8b
Oct  7 12:50:16 localhost kernel: RIP  [<ffffffffa038e283>] rdac_bus_detach+0xd/0x9a [scsi_dh_rdac]
Oct  7 12:50:16 localhost kernel:  RSP <ffff88007d4b3cd0>
Oct  7 12:50:16 localhost kernel: CR2: 0000000000000238
Oct  7 12:50:16 localhost kernel: ---[ end trace 9be4f9e6c2a759cf ]---

-------------------------------------------------------------------------

Your patch did not compile straight away (in scsi_dh.c), so I had to make some changes. I made a few changes based on what I know of the code; please correct me if anything is wrong.
Here is the patch.


--- scsi_dh.c   2008-10-07 10:25:40.000000000 -0500
+++ linux-2.6.27-rc7-babu/drivers/scsi/device_handler/scsi_dh.c 2008-10-07 12:13:19.000000000 -0500
@@ -148,17 +148,35 @@
 static int scsi_dh_handler_attach(struct scsi_device *sdev,
                                  struct scsi_device_handler *scsi_dh)
 {
+
        int err = 0;

-       if (sdev->scsi_dh_data) {
+       if (sdev->scsi_dh_data){
                if (sdev->scsi_dh_data->scsi_dh != scsi_dh)
                        err = -EBUSY;
-       } else if (scsi_dh->attach)
-               err = scsi_dh->attach(sdev);
+               else
+                       kref_get (&sdev->scsi_dh_data->kref);
+       } else if (scsi_dh->attach){
+               err = scsi_dh->attach (sdev);
+               if (!err)
+                       kref_init (&sdev->scsi_dh_data->kref);
+       }

        return err;
 }

+static void scsi_dh_release(struct kref *kref)
+{
+       struct scsi_dh_data *scsi_dh_data;
+       struct scsi_device *sdev;
+       scsi_dh_data = container_of(kref, struct scsi_dh_data, kref);
+       sdev = container_of(&scsi_dh_data, struct scsi_device, scsi_dh_data);
+
+       if (scsi_dh_data->scsi_dh && scsi_dh_data->scsi_dh->detach)
+               scsi_dh_data->scsi_dh->detach(sdev);
+}
+
+
 /*
  * scsi_dh_handler_detach - Detach a device handler from a device
  * @sdev - SCSI device the device handler should be detached from
@@ -175,12 +193,7 @@

        if (scsi_dh && scsi_dh != sdev->scsi_dh_data->scsi_dh)
                return;
-
-       if (!scsi_dh)
-               scsi_dh = sdev->scsi_dh_data->scsi_dh;
-
-       if (scsi_dh && scsi_dh->detach)
-               scsi_dh->detach(sdev);
+       kref_put(&sdev->scsi_dh_data->kref, scsi_dh_release);
 }

 /*



PS: Yes, you are right. With the linux-2.6.27-rc8 sources, I am not seeing the dh_state file in the sysfs filesystem. That is why I reverted to linux-2.6.27-rc7.

Thanks
Babu Moger

-----Original Message-----
From: Chandra Seetharaman [mailto:sekharan at us.ibm.com]
Sent: Monday, October 06, 2008 9:05 PM
To: Moger, Babu
Cc: device-mapper development; linux-scsi at vger.kernel.org
Subject: RE: [dm-devel] failover does not work with rdac device handler

Hi,

Can you try the attached patch?

chandra
PS: I see a problem (not related to this patch): the dh_state file
is not getting recreated. I am still working on it.

-------------

Keep a reference count of attaches, so that the same number of detaches is allowed.

Signed-off-by: Chandra Seetharaman <sekharan at us.ibm.com>
---
Index: linux-2.6.27-rc8-git5/drivers/scsi/device_handler/scsi_dh.c
===================================================================
--- linux-2.6.27-rc8-git5.orig/drivers/scsi/device_handler/scsi_dh.c
+++ linux-2.6.27-rc8-git5/drivers/scsi/device_handler/scsi_dh.c
@@ -153,12 +153,26 @@ static int scsi_dh_handler_attach(struct
        if (sdev->scsi_dh_data) {
                if (sdev->scsi_dh_data->scsi_dh != scsi_dh)
                        err = -EBUSY;
-       } else if (scsi_dh->attach)
+               else
+                       kref_get(&sdev->scsi_dh_data.kref);
+       } else if (scsi_dh->attach) {
                err = scsi_dh->attach(sdev);
+               if (!err)
+                       kref_init(&sdev->scsi_dh_data.kref);
+       }

        return err;
 }

+static void scsi_dh_release(struct *kref kref)
+{
+       struct scsi_dh_data *scsi_dh_data;
+       scsi_dh_data = container_of(kref, struct scsi_dh_data, kref);
+
+       if (scsi_dh_data->scsi_dh && scsi_dh_data->scsi_dh->detach)
+               scsi_dh_data->scsi_dh->detach(sdev);
+}
+
 /*
  * scsi_dh_handler_detach - Detach a device handler from a device
  * @sdev - SCSI device the device handler should be detached from
@@ -176,11 +190,7 @@ static void scsi_dh_handler_detach(struc
        if (scsi_dh && scsi_dh != sdev->scsi_dh_data->scsi_dh)
                return;

-       if (!scsi_dh)
-               scsi_dh = sdev->scsi_dh_data->scsi_dh;
-
-       if (scsi_dh && scsi_dh->detach)
-               scsi_dh->detach(sdev);
+       kref_put(&sdev->scsi_dh_data.kref, scsi_dh_release);
 }

 /*
Index: linux-2.6.27-rc8-git5/include/scsi/scsi_device.h
===================================================================
--- linux-2.6.27-rc8-git5.orig/include/scsi/scsi_device.h
+++ linux-2.6.27-rc8-git5/include/scsi/scsi_device.h
@@ -191,6 +191,7 @@ struct scsi_device_handler {

 struct scsi_dh_data {
        struct scsi_device_handler *scsi_dh;
+       struct kref kref;
        char buf[0];
 };







More information about the dm-devel mailing list