[dm-devel] [PATCH RFC] btrfs: sysfs: add <uuid>/debug/io_accounting/ directory

Qu Wenruo quwenruo.btrfs at gmx.com
Fri Jan 21 09:20:24 UTC 2022



On 2022/1/21 17:14, Anand Jain wrote:
> On 21/01/2022 13:24, Qu Wenruo wrote:
>> [BACKGROUND]
>> There is a series of btrfs autodefrag bugs in v5.16, that would cause
>> way more IO than previous kernels.
>>
>> Unfortunately there isn't any test case covering autodefrag, nor there
>> is anyway to show the io accounting of a btrfs.
>>
>> [WORKAROUND]
>> I originally want to implement a dm target to do the io accounting for
>> all filesystems, but can not find a good enough interface (the status
>> interface has only 3 pre-defined workload).
>>
>> Thus I turned to btrfs specific io accounting first.
>> One thing specific to btrfs is its integrated volume management/RAID.
>>
>> Without proper profiles specification, default profile will cause
>> metadata IO to be accounted twice (DUP profile) and only data IO is
>> accounted correctly.
>>
>> So for btrfs this patch will introduce a new sysfs directory in
>> /sys/fs/btrfs/<uuid>/debug/io_accounting/
>>
>> And have the following files:
>>
>> - meta_read:    Metadata bytes read
>> - meta_write:    Metadata bytes written
>> - data_read:    Data bytes read
>> - data_write:    Data bytes written
>>         (including both zoned append and regular write)
>>
>> And all these accounting is in logical address space, meaning profile
>> will not affect the values.
>>
>> All those values can be reset by simply "echo 0".
>>
>> Signed-off-by: Qu Wenruo <wqu at suse.com>
>> ---
>> Reason for RFC:
>>
>> - (To DM guys) Is there any good way to implement a dm target to do 
>> the IO
>>    accounting?
>>    A more generic one can help more filesystems.
>>
>> - (To Btrfs guys) Is the sysfs interface fine?
> 
>   I am in for it. It can be a non debug feature IMO.
>   More below.
> 
>> ---
>>   fs/btrfs/ctree.h   | 11 +++++++
>>   fs/btrfs/disk-io.c |  1 +
>>   fs/btrfs/sysfs.c   | 77 ++++++++++++++++++++++++++++++++++++++++++++++
>>   fs/btrfs/volumes.c | 24 +++++++++++++++
>>   4 files changed, 113 insertions(+)
>>
>> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
>> index b4a9b1c58d22..3983bceaef7f 100644
>> --- a/fs/btrfs/ctree.h
>> +++ b/fs/btrfs/ctree.h
>> @@ -1035,10 +1035,21 @@ struct btrfs_fs_info {
>>   #ifdef CONFIG_BTRFS_DEBUG
>>       struct kobject *debug_kobj;
>>       struct kobject *discard_debug_kobj;
>> +    struct kobject *io_accounting_debug_kobj;
>>       struct list_head allocated_roots;
>>       spinlock_t eb_leak_lock;
>>       struct list_head allocated_ebs;
>> +
>> +    spinlock_t io_accounting_lock;
>> +    /*
>> +     * The IO accounting unit are all in byte, and are in logical 
>> address
>> +     * space, which is before the RAID/DUP mapping.
>> +     */
>> +    u64 meta_read;
>> +    u64 meta_write;
>> +    u64 data_read;
>> +    u64 data_write;
>>   #endif
>>   };
>> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
>> index 87a5addbedf6..41b56fde6e97 100644
>> --- a/fs/btrfs/disk-io.c
>> +++ b/fs/btrfs/disk-io.c
>> @@ -3065,6 +3065,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info 
>> *fs_info)
>>       INIT_LIST_HEAD(&fs_info->allocated_roots);
>>       INIT_LIST_HEAD(&fs_info->allocated_ebs);
>>       spin_lock_init(&fs_info->eb_leak_lock);
>> +    spin_lock_init(&fs_info->io_accounting_lock);
>>   #endif
>>       extent_map_tree_init(&fs_info->mapping_tree);
>>       btrfs_init_block_rsv(&fs_info->global_block_rsv,
>> diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
>> index beb7f72d50b8..dfdef93bdeab 100644
>> --- a/fs/btrfs/sysfs.c
>> +++ b/fs/btrfs/sysfs.c
>> @@ -40,6 +40,7 @@
>>    * btrfs_debug_feature_attrs        /sys/fs/btrfs/debug
>>    * btrfs_debug_mount_attrs        /sys/fs/btrfs/<uuid>/debug
>>    * discard_debug_attrs            /sys/fs/btrfs/<uuid>/debug/discard
>> + * io_accounting_debug_attrs        
>> /sys/fs/btrfs/<uuid>/debug/io_accounting
>>    */
>>   struct btrfs_feature_attr {
>> @@ -616,6 +617,62 @@ static const struct attribute_group 
>> btrfs_debug_feature_attr_group = {
>>       .attrs = btrfs_debug_feature_attrs,
>>   };
>> +/* IO accounting */
>> +#define io_accounting_to_fs_info(_kobj)    
>> to_fs_info((_kobj)->parent->parent)
>> +
>> +#define DECLARE_IO_ACCOUNTING_OPS(name)                    \
>> +static ssize_t io_accounting_##name##_show(struct kobject *kobj,    \
>> +                       struct kobj_attribute *a,    \
>> +                       char *buf)            \
>> +{                                    \
>> +    struct btrfs_fs_info *fs_info = io_accounting_to_fs_info(kobj);    \
>> +    u64 result;                            \
>> +                                    \
>> +    spin_lock(&fs_info->io_accounting_lock);            \
>> +    result = fs_info->name;                        \
>> +    spin_unlock(&fs_info->io_accounting_lock);            \
>> +    return sysfs_emit(buf, "%llu\n", result);            \
>> +}                                    \
>> +static ssize_t io_accounting_##name##_store(struct kobject *kobj,    \
>> +                        struct kobj_attribute *a,    \
>> +                        const char *buf,        \
>> +                        size_t count)         \
>> +{                                    \
>> +    struct btrfs_fs_info *fs_info = io_accounting_to_fs_info(kobj);    \
>> +    u64 value;                            \
>> +    int ret;                            \
>> +                                    \
>> +    ret = kstrtoull(skip_spaces(buf), 0, &value);            \
>> +    if (ret)                            \
>> +        return ret;                        \
>> +    spin_lock(&fs_info->io_accounting_lock);            \
>> +    fs_info->name = value;                        \
>> +    spin_unlock(&fs_info->io_accounting_lock);            \
>> +    return count;                            \
>> +}
>> +
>> +DECLARE_IO_ACCOUNTING_OPS(meta_read);
>> +DECLARE_IO_ACCOUNTING_OPS(meta_write);
>> +DECLARE_IO_ACCOUNTING_OPS(data_read);
>> +DECLARE_IO_ACCOUNTING_OPS(data_write);
>> +
>> +BTRFS_ATTR_RW(io_accounting, meta_read, io_accounting_meta_read_show,
>> +          io_accounting_meta_read_store);
>> +BTRFS_ATTR_RW(io_accounting, meta_write, io_accounting_meta_write_show,
>> +          io_accounting_meta_write_store);
>> +BTRFS_ATTR_RW(io_accounting, data_read, io_accounting_data_read_show,
>> +          io_accounting_data_read_store);
>> +BTRFS_ATTR_RW(io_accounting, data_write, io_accounting_data_write_show,
>> +          io_accounting_data_write_store);
>> +
>> +static const struct attribute *io_accounting_debug_attrs[] = {
>> +    BTRFS_ATTR_PTR(io_accounting, meta_read),
>> +    BTRFS_ATTR_PTR(io_accounting, meta_write),
>> +    BTRFS_ATTR_PTR(io_accounting, data_read),
>> +    BTRFS_ATTR_PTR(io_accounting, data_write),
>> +    NULL,
>> +};
>> +
>>   #endif
>>   static ssize_t btrfs_show_u64(u64 *value_ptr, spinlock_t *lock, char 
>> *buf)
>> @@ -1219,6 +1276,12 @@ void btrfs_sysfs_remove_mounted(struct 
>> btrfs_fs_info *fs_info)
>>           kobject_del(fs_info->discard_debug_kobj);
>>           kobject_put(fs_info->discard_debug_kobj);
>>       }
>> +    if (fs_info->io_accounting_debug_kobj) {
>> +        sysfs_remove_files(fs_info->io_accounting_debug_kobj,
>> +                   io_accounting_debug_attrs);
>> +        kobject_del(fs_info->io_accounting_debug_kobj);
>> +        kobject_put(fs_info->io_accounting_debug_kobj);
>> +    }
>>       if (fs_info->debug_kobj) {
>>           sysfs_remove_files(fs_info->debug_kobj, 
>> btrfs_debug_mount_attrs);
>>           kobject_del(fs_info->debug_kobj);
>> @@ -1804,6 +1867,20 @@ int btrfs_sysfs_add_mounted(struct 
>> btrfs_fs_info *fs_info)
>>                      discard_debug_attrs);
>>       if (error)
>>           goto failure;
>> +
>> +    /* io_accounting directory */
>> +    fs_info->io_accounting_debug_kobj =
>> +        kobject_create_and_add("io_accounting",fs_info->debug_kobj);
>> +    if (!fs_info->io_accounting_debug_kobj) {
>> +        error = -ENOMEM;
>> +        goto failure;
>> +    }
>> +
>> +    error = sysfs_create_files(fs_info->io_accounting_debug_kobj,
>> +                   io_accounting_debug_attrs);
>> +    if (error)
>> +        goto failure;
>> +
>>   #endif
>>       error = addrm_unknown_feature_attrs(fs_info, true);
>> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
>> index 2332e3026efa..58f2ec0a611a 100644
>> --- a/fs/btrfs/volumes.c
>> +++ b/fs/btrfs/volumes.c
>> @@ -6763,6 +6763,29 @@ static void bioc_error(struct btrfs_io_context 
>> *bioc, struct bio *bio, u64 logic
>>       }
>>   }
>> +static void update_io_accounting(struct btrfs_fs_info *fs_info, 
>> struct bio *bio)
>> +{
>> +    u32 length = bio->bi_iter.bi_size;
>> +    bool metadata = bio->bi_opf & REQ_META;
>> +
>> +#ifdef    CONFIG_BTRFS_DEBUG
>> +    spin_lock(&fs_info->io_accounting_lock);
>> +    if (bio_op(bio) == REQ_OP_READ) {
>> +        if (metadata)
>> +            fs_info->meta_read += length;
>> +        else
>> +            fs_info->data_read += length;
>> +    } else {
>> +        if (metadata)
>> +            fs_info->meta_write += length;
>> +        else
>> +            fs_info->data_write += length;
>> +    }
>> +    spin_unlock(&fs_info->io_accounting_lock);
>> +#endif
>> +    return;
>> +}
>> +
>>   blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio 
>> *bio,
>>                  int mirror_num)
>>   {
>> @@ -6776,6 +6799,7 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info 
>> *fs_info, struct bio *bio,
>  >
>>       int total_devs;
>>       struct btrfs_io_context *bioc = NULL;
>> +    update_io_accounting(fs_info, bio);
> 
>   Generally, speaking accounting should go at the end of IO.
>   Other than that looks good.

That is a little more complex than this solution.

At endio time, we no longer have reliable bi_iter->bi_size.

And if we go bio_for_each_segment(), it will go conflicts with recent 
bio split patchset.

Thus I prefer to do the accounting before the IO.

> 
>   Also, need a way to reset the values.

Check io_accounting_##name##_store(), which will reset the value to the 
provided one.

Normally we only need to "echo 0" to reset.

Thanks,
Qu

> 
> Thanks, Anand
> 
>>       length = bio->bi_iter.bi_size;
>>       map_length = length;
> 




More information about the dm-devel mailing list