diff mbox series

[v4] ceph: fix slab-use-after-free in have_mon_and_osd_map()

Message ID 20250305194437.59309-1-slava@dubeyko.com
State New
Headers show
Series [v4] ceph: fix slab-use-after-free in have_mon_and_osd_map() | expand

Commit Message

Viacheslav Dubeyko March 5, 2025, 7:44 p.m. UTC
From: Viacheslav Dubeyko <Slava.Dubeyko@ibm.com>

The generic/395 and generic/397 tests are capable of generating
an oops on line net/ceph/ceph_common.c:794 with
KASAN enabled.

BUG: KASAN: slab-use-after-free in have_mon_and_osd_map+0x56/0x70
Read of size 4 at addr ffff88811012d810 by task mount.ceph/13305

CPU: 2 UID: 0 PID: 13305 Comm: mount.ceph Not tainted 6.14.0-rc2-build2+ #1266
Hardware name: ASUS All Series/H97-PLUS, BIOS 2306 10/09/2014
Call Trace:
<TASK>
dump_stack_lvl+0x57/0x80
? have_mon_and_osd_map+0x56/0x70
print_address_description.constprop.0+0x84/0x330
? have_mon_and_osd_map+0x56/0x70
print_report+0xe2/0x1e0
? rcu_read_unlock_sched+0x60/0x80
? kmem_cache_debug_flags+0xc/0x20
? fixup_red_left+0x17/0x30
? have_mon_and_osd_map+0x56/0x70
kasan_report+0x8d/0xc0
? have_mon_and_osd_map+0x56/0x70
have_mon_and_osd_map+0x56/0x70
ceph_open_session+0x182/0x290
? __pfx_ceph_open_session+0x10/0x10
? __init_swait_queue_head+0x8d/0xa0
? __pfx_autoremove_wake_function+0x10/0x10
? shrinker_register+0xdd/0xf0
ceph_get_tree+0x333/0x680
vfs_get_tree+0x49/0x180
do_new_mount+0x1a3/0x2d0
? __pfx_do_new_mount+0x10/0x10
? security_capable+0x39/0x70
path_mount+0x6dd/0x730
? __pfx_path_mount+0x10/0x10
? kmem_cache_free+0x1e5/0x270
? user_path_at+0x48/0x60
do_mount+0x99/0xe0
? __pfx_do_mount+0x10/0x10
? lock_release+0x155/0x190
__do_sys_mount+0x141/0x180
do_syscall_64+0x9f/0x100
entry_SYSCALL_64_after_hwframe+0x76/0x7e
RIP: 0033:0x7f01b1b14f3e
Code: 48 8b 0d d5 3e 0f 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 49 89 ca b8 a5 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d a2 3e 0f 00 f7 d8 64 89 01 48
RSP: 002b:00007fffd129fa08 EFLAGS: 00000246 ORIG_RAX: 00000000000000a5
RAX: ffffffffffffffda RBX: 0000564ec01a7850 RCX: 00007f01b1b14f3e
RDX: 0000564ec00f2225 RSI: 00007fffd12a1964 RDI: 0000564ec0147a20
RBP: 00007fffd129fbd0 R08: 0000564ec014da90 R09: 0000000000000080
R10: 0000000000000000 R11: 0000000000000246 R12: 00007fffd12a194e
R13: 0000000000000000 R14: 00007fffd129fa50 R15: 00007fffd129fa40
</TASK>

Allocated by task 13305:
stack_trace_save+0x8c/0xc0
kasan_save_stack+0x1e/0x40
kasan_save_track+0x10/0x30
__kasan_kmalloc+0x3a/0x50
__kmalloc_noprof+0x247/0x290
ceph_osdmap_alloc+0x16/0x130
ceph_osdc_init+0x27a/0x4c0
ceph_create_client+0x153/0x190
create_fs_client+0x50/0x2a0
ceph_get_tree+0xff/0x680
vfs_get_tree+0x49/0x180
do_new_mount+0x1a3/0x2d0
path_mount+0x6dd/0x730
do_mount+0x99/0xe0
__do_sys_mount+0x141/0x180
do_syscall_64+0x9f/0x100
entry_SYSCALL_64_after_hwframe+0x76/0x7e

Freed by task 9475:
stack_trace_save+0x8c/0xc0
kasan_save_stack+0x1e/0x40
kasan_save_track+0x10/0x30
kasan_save_free_info+0x3b/0x50
__kasan_slab_free+0x18/0x30
kfree+0x212/0x290
handle_one_map+0x23c/0x3b0
ceph_osdc_handle_map+0x3c9/0x590
mon_dispatch+0x655/0x6f0
ceph_con_process_message+0xc3/0xe0
ceph_con_v1_try_read+0x614/0x760
ceph_con_workfn+0x2de/0x650
process_one_work+0x486/0x7c0
process_scheduled_works+0x73/0x90
worker_thread+0x1c8/0x2a0
kthread+0x2ec/0x300
ret_from_fork+0x24/0x40
ret_from_fork_asm+0x1a/0x30

The buggy address belongs to the object at ffff88811012d800
which belongs to the cache kmalloc-512 of size 512
The buggy address is located 16 bytes inside of
freed 512-byte region [ffff88811012d800, ffff88811012da00)

The buggy address belongs to the physical page:
page: refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x11012c
head: order:2 mapcount:0 entire_mapcount:0 nr_pages_mapped:0 pincount:0
flags: 0x200000000000040(head|node=0|zone=2)
page_type: f5(slab)
raw: 0200000000000040 ffff888100042c80 dead000000000100 dead000000000122
raw: 0000000000000000 0000000080100010 00000000f5000000 0000000000000000
head: 0200000000000040 ffff888100042c80 dead000000000100 dead000000000122
head: 0000000000000000 0000000080100010 00000000f5000000 0000000000000000
head: 0200000000000002 ffffea0004404b01 ffffffffffffffff 0000000000000000
head: 0000000000000004 0000000000000000 00000000ffffffff 0000000000000000
page dumped because: kasan: bad access detected

Memory state around the buggy address:
ffff88811012d700: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
ffff88811012d780: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
>ffff88811012d800: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
                         ^
ffff88811012d880: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
ffff88811012d900: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
==================================================================
Disabling lock debugging due to kernel taint
libceph: client274326 fsid 8598140e-35c2-11ee-b97c-001517c545cc
libceph: mon0 (1)90.155.74.19:6789 session established
libceph: client274327 fsid 8598140e-35c2-11ee-b97c-001517c545cc

We have the following scenario:

Thread 1:
void ceph_osdmap_destroy(...) {
    <skipped>
    kfree(map);
}
Thread 1 sleeps...

Thread 2:
static bool have_mon_and_osd_map(struct ceph_client *client) {
    return client->monc.monmap && client->monc.monmap->epoch &&
        client->osdc.osdmap && client->osdc.osdmap->epoch;
}
Thread 2 hits the oops...

Thread 1 wakes up:
static int handle_one_map(...) {
    <skipped>
    osdc->osdmap = newmap;
    <skipped>
}

This patch fixes the issue by taking client->osdc.lock and
client->monc.mutex before checking client->osdc.osdmap and
client->monc.monmap in have_mon_and_osd_map(). The patch also
adds locking in ceph_osdc_stop() around the destruction of
osdc->osdmap and the assignment of NULL to the pointer, and
takes the mutex in ceph_monc_stop() while freeing monc->monmap
and setting the pointer to NULL. The monmap_show() and
osdmap_show() methods are reworked to prevent a potential race
while they access the maps.

Reported-by: David Howells <dhowells@redhat.com>
Signed-off-by: Viacheslav Dubeyko <Slava.Dubeyko@ibm.com>
---
 net/ceph/ceph_common.c | 14 ++++++++++++--
 net/ceph/debugfs.c     | 33 +++++++++++++++++++--------------
 net/ceph/mon_client.c  |  9 ++++++++-
 net/ceph/osd_client.c  |  4 ++++
 4 files changed, 43 insertions(+), 17 deletions(-)

Comments

David Howells March 10, 2025, 2 p.m. UTC | #1
Viacheslav Dubeyko <slava@dubeyko.com> wrote:

> The generic/395 and generic/397 tests are capable of generating
> an oops on line net/ceph/ceph_common.c:794 with
> KASAN enabled.
> 
> BUG: KASAN: slab-use-after-free in have_mon_and_osd_map+0x56/0x70
> Read of size 4 at addr ffff88811012d810 by task mount.ceph/13305
> ...
> This patch fixes the issue by taking client->osdc.lock and
> client->monc.mutex before checking client->osdc.osdmap and
> client->monc.monmap in have_mon_and_osd_map(). The patch also
> adds locking in ceph_osdc_stop() around the destruction of
> osdc->osdmap and the assignment of NULL to the pointer, and
> takes the mutex in ceph_monc_stop() while freeing monc->monmap
> and setting the pointer to NULL. The monmap_show() and
> osdmap_show() methods are reworked to prevent a potential race
> while they access the maps.
> 
> Reported-by: David Howells <dhowells@redhat.com>
> Signed-off-by: Viacheslav Dubeyko <Slava.Dubeyko@ibm.com>

Reviewed-by: David Howells <dhowells@redhat.com>
Ilya Dryomov April 8, 2025, 3:20 p.m. UTC | #2
On Wed, Mar 5, 2025 at 8:44 PM Viacheslav Dubeyko <slava@dubeyko.com> wrote:
>
> From: Viacheslav Dubeyko <Slava.Dubeyko@ibm.com>
>
> The generic/395 and generic/397 tests are capable of generating
> an oops on line net/ceph/ceph_common.c:794 with
> KASAN enabled.
>
> BUG: KASAN: slab-use-after-free in have_mon_and_osd_map+0x56/0x70
> Read of size 4 at addr ffff88811012d810 by task mount.ceph/13305
>
> CPU: 2 UID: 0 PID: 13305 Comm: mount.ceph Not tainted 6.14.0-rc2-build2+ #1266
> Hardware name: ASUS All Series/H97-PLUS, BIOS 2306 10/09/2014
> Call Trace:
> <TASK>
> dump_stack_lvl+0x57/0x80
> ? have_mon_and_osd_map+0x56/0x70
> print_address_description.constprop.0+0x84/0x330
> ? have_mon_and_osd_map+0x56/0x70
> print_report+0xe2/0x1e0
> ? rcu_read_unlock_sched+0x60/0x80
> ? kmem_cache_debug_flags+0xc/0x20
> ? fixup_red_left+0x17/0x30
> ? have_mon_and_osd_map+0x56/0x70
> kasan_report+0x8d/0xc0
> ? have_mon_and_osd_map+0x56/0x70
> have_mon_and_osd_map+0x56/0x70
> ceph_open_session+0x182/0x290
> ? __pfx_ceph_open_session+0x10/0x10
> ? __init_swait_queue_head+0x8d/0xa0
> ? __pfx_autoremove_wake_function+0x10/0x10
> ? shrinker_register+0xdd/0xf0
> ceph_get_tree+0x333/0x680
> vfs_get_tree+0x49/0x180
> do_new_mount+0x1a3/0x2d0
> ? __pfx_do_new_mount+0x10/0x10
> ? security_capable+0x39/0x70
> path_mount+0x6dd/0x730
> ? __pfx_path_mount+0x10/0x10
> ? kmem_cache_free+0x1e5/0x270
> ? user_path_at+0x48/0x60
> do_mount+0x99/0xe0
> ? __pfx_do_mount+0x10/0x10
> ? lock_release+0x155/0x190
> __do_sys_mount+0x141/0x180
> do_syscall_64+0x9f/0x100
> entry_SYSCALL_64_after_hwframe+0x76/0x7e
> RIP: 0033:0x7f01b1b14f3e
> Code: 48 8b 0d d5 3e 0f 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 49 89 ca b8 a5 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d a2 3e 0f 00 f7 d8 64 89 01 48
> RSP: 002b:00007fffd129fa08 EFLAGS: 00000246 ORIG_RAX: 00000000000000a5
> RAX: ffffffffffffffda RBX: 0000564ec01a7850 RCX: 00007f01b1b14f3e
> RDX: 0000564ec00f2225 RSI: 00007fffd12a1964 RDI: 0000564ec0147a20
> RBP: 00007fffd129fbd0 R08: 0000564ec014da90 R09: 0000000000000080
> R10: 0000000000000000 R11: 0000000000000246 R12: 00007fffd12a194e
> R13: 0000000000000000 R14: 00007fffd129fa50 R15: 00007fffd129fa40
> </TASK>
>
> Allocated by task 13305:
> stack_trace_save+0x8c/0xc0
> kasan_save_stack+0x1e/0x40
> kasan_save_track+0x10/0x30
> __kasan_kmalloc+0x3a/0x50
> __kmalloc_noprof+0x247/0x290
> ceph_osdmap_alloc+0x16/0x130
> ceph_osdc_init+0x27a/0x4c0
> ceph_create_client+0x153/0x190
> create_fs_client+0x50/0x2a0
> ceph_get_tree+0xff/0x680
> vfs_get_tree+0x49/0x180
> do_new_mount+0x1a3/0x2d0
> path_mount+0x6dd/0x730
> do_mount+0x99/0xe0
> __do_sys_mount+0x141/0x180
> do_syscall_64+0x9f/0x100
> entry_SYSCALL_64_after_hwframe+0x76/0x7e
>
> Freed by task 9475:
> stack_trace_save+0x8c/0xc0
> kasan_save_stack+0x1e/0x40
> kasan_save_track+0x10/0x30
> kasan_save_free_info+0x3b/0x50
> __kasan_slab_free+0x18/0x30
> kfree+0x212/0x290
> handle_one_map+0x23c/0x3b0
> ceph_osdc_handle_map+0x3c9/0x590
> mon_dispatch+0x655/0x6f0
> ceph_con_process_message+0xc3/0xe0
> ceph_con_v1_try_read+0x614/0x760
> ceph_con_workfn+0x2de/0x650
> process_one_work+0x486/0x7c0
> process_scheduled_works+0x73/0x90
> worker_thread+0x1c8/0x2a0
> kthread+0x2ec/0x300
> ret_from_fork+0x24/0x40
> ret_from_fork_asm+0x1a/0x30
>
> The buggy address belongs to the object at ffff88811012d800
> which belongs to the cache kmalloc-512 of size 512
> The buggy address is located 16 bytes inside of
> freed 512-byte region [ffff88811012d800, ffff88811012da00)
>
> The buggy address belongs to the physical page:
> page: refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x11012c
> head: order:2 mapcount:0 entire_mapcount:0 nr_pages_mapped:0 pincount:0
> flags: 0x200000000000040(head|node=0|zone=2)
> page_type: f5(slab)
> raw: 0200000000000040 ffff888100042c80 dead000000000100 dead000000000122
> raw: 0000000000000000 0000000080100010 00000000f5000000 0000000000000000
> head: 0200000000000040 ffff888100042c80 dead000000000100 dead000000000122
> head: 0000000000000000 0000000080100010 00000000f5000000 0000000000000000
> head: 0200000000000002 ffffea0004404b01 ffffffffffffffff 0000000000000000
> head: 0000000000000004 0000000000000000 00000000ffffffff 0000000000000000
> page dumped because: kasan: bad access detected
>
> Memory state around the buggy address:
> ffff88811012d700: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
> ffff88811012d780: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
> >ffff88811012d800: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
>                          ^
> ffff88811012d880: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
> ffff88811012d900: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
> ==================================================================
> Disabling lock debugging due to kernel taint
> libceph: client274326 fsid 8598140e-35c2-11ee-b97c-001517c545cc
> libceph: mon0 (1)90.155.74.19:6789 session established
> libceph: client274327 fsid 8598140e-35c2-11ee-b97c-001517c545cc
>
> We have such scenario:
>
> Thread 1:
> void ceph_osdmap_destroy(...) {
>     <skipped>
>     kfree(map);
> }
> Thread 1 sleep...
>
> Thread 2:
> static bool have_mon_and_osd_map(struct ceph_client *client) {
>     return client->monc.monmap && client->monc.monmap->epoch &&
>         client->osdc.osdmap && client->osdc.osdmap->epoch;
> }
> Thread 2 has oops...
>
> Thread 1 wake up:
> static int handle_one_map(...) {
>     <skipped>
>     osdc->osdmap = newmap;
>     <skipped>
> }
>
> This patch fixes the issue by taking client->osdc.lock and
> client->monc.mutex before checking client->osdc.osdmap and
> client->monc.monmap in have_mon_and_osd_map(). The patch also
> adds locking in ceph_osdc_stop() around the destruction of
> osdc->osdmap and the assignment of NULL to the pointer, and
> takes the mutex in ceph_monc_stop() while freeing monc->monmap
> and setting the pointer to NULL. The monmap_show() and
> osdmap_show() methods are reworked to prevent a potential race
> while they access the maps.
>
> Reported-by: David Howells <dhowells@redhat.com>
> Signed-off-by: Viacheslav Dubeyko <Slava.Dubeyko@ibm.com>
> ---
>  net/ceph/ceph_common.c | 14 ++++++++++++--
>  net/ceph/debugfs.c     | 33 +++++++++++++++++++--------------
>  net/ceph/mon_client.c  |  9 ++++++++-
>  net/ceph/osd_client.c  |  4 ++++
>  4 files changed, 43 insertions(+), 17 deletions(-)
>
> diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
> index 4c6441536d55..5c8fd78d6bd5 100644
> --- a/net/ceph/ceph_common.c
> +++ b/net/ceph/ceph_common.c
> @@ -790,8 +790,18 @@ EXPORT_SYMBOL(ceph_reset_client_addr);
>   */
>  static bool have_mon_and_osd_map(struct ceph_client *client)
>  {
> -       return client->monc.monmap && client->monc.monmap->epoch &&
> -              client->osdc.osdmap && client->osdc.osdmap->epoch;
> +       bool have_mon_map = false;
> +       bool have_osd_map = false;
> +
> +       mutex_lock(&client->monc.mutex);

Hi Slava,

This introduces a potential sleep into a function that is used as
a wait_event condition in __ceph_open_session() which could result in
a nested sleep.  This is frowned up by the scheduler because the task
state may get reset to TASK_RUNNING when not desired:

  do not call blocking ops when !TASK_RUNNING; state=1 set at
[<ffffffff819c19fc>] prepare_to_wait_event+0x3ac/0x460
kernel/sched/wait.c:298

You would probably need to open-code waiting in __ceph_open_session()
per [1].
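
For illustration, an open-coded wait along the lines of the wait_woken()
pattern might look roughly like the sketch below. It is not the actual
implementation: it reuses client->auth_wq, client->auth_err and the timeout
handling already present in __ceph_open_session(), and the point is simply
that the (possibly sleeping) condition is evaluated only while the task is
TASK_RUNNING:

	long left = ceph_timeout_jiffies(timeout);
	DEFINE_WAIT_FUNC(wait, woken_wake_function);
	int err = 0;

	add_wait_queue(&client->auth_wq, &wait);
	/* check the condition (which may take mutexes) while TASK_RUNNING */
	while (!have_mon_and_osd_map(client) && client->auth_err >= 0) {
		if (signal_pending(current)) {
			err = -ERESTARTSYS;
			break;
		}
		/*
		 * wait_woken() sets the task state itself and uses
		 * WQ_FLAG_WOKEN to close the race with the waker.
		 */
		left = wait_woken(&wait, TASK_INTERRUPTIBLE, left);
		if (!left) {
			err = -ETIMEDOUT;
			break;
		}
	}
	remove_wait_queue(&client->auth_wq, &wait);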

> +       have_mon_map = client->monc.monmap && client->monc.monmap->epoch;
> +       mutex_unlock(&client->monc.mutex);
> +
> +       down_read(&client->osdc.lock);
> +       have_osd_map = client->osdc.osdmap && client->osdc.osdmap->epoch;
> +       up_read(&client->osdc.lock);
> +
> +       return have_mon_map && have_osd_map;
>  }
>
>  /*
> diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
> index 2110439f8a24..6e2014c813ca 100644
> --- a/net/ceph/debugfs.c
> +++ b/net/ceph/debugfs.c
> @@ -36,18 +36,20 @@ static int monmap_show(struct seq_file *s, void *p)
>         int i;
>         struct ceph_client *client = s->private;
>
> -       if (client->monc.monmap == NULL)
> -               return 0;
> -
> -       seq_printf(s, "epoch %d\n", client->monc.monmap->epoch);
> -       for (i = 0; i < client->monc.monmap->num_mon; i++) {
> -               struct ceph_entity_inst *inst =
> -                       &client->monc.monmap->mon_inst[i];
> -
> -               seq_printf(s, "\t%s%lld\t%s\n",
> -                          ENTITY_NAME(inst->name),
> -                          ceph_pr_addr(&inst->addr));
> +       mutex_lock(&client->monc.mutex);
> +       if (client->monc.monmap) {
> +               seq_printf(s, "epoch %d\n", client->monc.monmap->epoch);
> +               for (i = 0; i < client->monc.monmap->num_mon; i++) {
> +                       struct ceph_entity_inst *inst =
> +                               &client->monc.monmap->mon_inst[i];
> +
> +                       seq_printf(s, "\t%s%lld\t%s\n",
> +                                  ENTITY_NAME(inst->name),
> +                                  ceph_pr_addr(&inst->addr));
> +               }
>         }
> +       mutex_unlock(&client->monc.mutex);

Nit: I'd prefer

        mutex_lock(&client->monc.mutex);
        if (client->monc.monmap == NULL)
                goto out_unlock;

        ...

out_unlock:
        mutex_unlock(&client->monc.mutex);
        return 0;

to an additional level of indentation here.
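
Applied to monmap_show(), that suggestion would look roughly like this
(a sketch based on the hunk above, not the committed code):

	static int monmap_show(struct seq_file *s, void *p)
	{
		int i;
		struct ceph_client *client = s->private;

		mutex_lock(&client->monc.mutex);
		if (client->monc.monmap == NULL)
			goto out_unlock;

		seq_printf(s, "epoch %d\n", client->monc.monmap->epoch);
		for (i = 0; i < client->monc.monmap->num_mon; i++) {
			struct ceph_entity_inst *inst =
				&client->monc.monmap->mon_inst[i];

			seq_printf(s, "\t%s%lld\t%s\n",
				   ENTITY_NAME(inst->name),
				   ceph_pr_addr(&inst->addr));
		}

	out_unlock:
		mutex_unlock(&client->monc.mutex);
		return 0;
	}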

> +
>         return 0;
>  }
>
> @@ -56,13 +58,15 @@ static int osdmap_show(struct seq_file *s, void *p)
>         int i;
>         struct ceph_client *client = s->private;
>         struct ceph_osd_client *osdc = &client->osdc;
> -       struct ceph_osdmap *map = osdc->osdmap;
> +       struct ceph_osdmap *map = NULL;
>         struct rb_node *n;
>
> +       down_read(&osdc->lock);
> +
> +       map = osdc->osdmap;
>         if (map == NULL)
> -               return 0;
> +               goto finish_osdmap_show;
>
> -       down_read(&osdc->lock);
>         seq_printf(s, "epoch %u barrier %u flags 0x%x\n", map->epoch,
>                         osdc->epoch_barrier, map->flags);
>
> @@ -131,6 +135,7 @@ static int osdmap_show(struct seq_file *s, void *p)
>                 seq_printf(s, "]\n");
>         }
>
> +finish_osdmap_show:

Nit: I'd rename this label to out_unlock so that the name communicates what
the label is for.

>         up_read(&osdc->lock);
>         return 0;
>  }
> diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
> index ab66b599ac47..b299e5bbddb1 100644
> --- a/net/ceph/mon_client.c
> +++ b/net/ceph/mon_client.c
> @@ -1232,6 +1232,7 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
>         ceph_auth_destroy(monc->auth);
>  out_monmap:
>         kfree(monc->monmap);
> +       monc->monmap = NULL;
>  out:
>         return err;
>  }
> @@ -1239,6 +1240,8 @@ EXPORT_SYMBOL(ceph_monc_init);
>
>  void ceph_monc_stop(struct ceph_mon_client *monc)
>  {
> +       struct ceph_monmap *old_monmap;
> +
>         dout("stop\n");
>
>         mutex_lock(&monc->mutex);
> @@ -1266,7 +1269,11 @@ void ceph_monc_stop(struct ceph_mon_client *monc)
>         ceph_msg_put(monc->m_subscribe);
>         ceph_msg_put(monc->m_subscribe_ack);
>
> -       kfree(monc->monmap);
> +       mutex_lock(&monc->mutex);

Did you have a specific reason for adding locking here and in
ceph_osdc_stop()?  I see that David inquired about this in a comment
on v2 and I think the answer to his question is no.  By this point we
are long past the point of no return and no new messages can arrive for
e.g. handle_one_map() to be called.

[1] https://lwn.net/Articles/628628/

Thanks,

                Ilya
Viacheslav Dubeyko April 8, 2025, 7:39 p.m. UTC | #3
On Tue, 2025-04-08 at 17:20 +0200, Ilya Dryomov wrote:
> On Wed, Mar 5, 2025 at 8:44 PM Viacheslav Dubeyko <slava@dubeyko.com> wrote:
> > 
> > From: Viacheslav Dubeyko <Slava.Dubeyko@ibm.com>
> > 
> > The generic/395 and generic/397 tests are capable of generating
> > an oops on line net/ceph/ceph_common.c:794 with
> > KASAN enabled.
> > 
> > BUG: KASAN: slab-use-after-free in have_mon_and_osd_map+0x56/0x70
> > Read of size 4 at addr ffff88811012d810 by task mount.ceph/13305
> > 
> > CPU: 2 UID: 0 PID: 13305 Comm: mount.ceph Not tainted 6.14.0-rc2-build2+ #1266
> > Hardware name: ASUS All Series/H97-PLUS, BIOS 2306 10/09/2014
> > Call Trace:
> > <TASK>
> > dump_stack_lvl+0x57/0x80
> > ? have_mon_and_osd_map+0x56/0x70
> > print_address_description.constprop.0+0x84/0x330
> > ? have_mon_and_osd_map+0x56/0x70
> > print_report+0xe2/0x1e0
> > ? rcu_read_unlock_sched+0x60/0x80
> > ? kmem_cache_debug_flags+0xc/0x20
> > ? fixup_red_left+0x17/0x30
> > ? have_mon_and_osd_map+0x56/0x70
> > kasan_report+0x8d/0xc0
> > ? have_mon_and_osd_map+0x56/0x70
> > have_mon_and_osd_map+0x56/0x70
> > ceph_open_session+0x182/0x290
> > ? __pfx_ceph_open_session+0x10/0x10
> > ? __init_swait_queue_head+0x8d/0xa0
> > ? __pfx_autoremove_wake_function+0x10/0x10
> > ? shrinker_register+0xdd/0xf0
> > ceph_get_tree+0x333/0x680
> > vfs_get_tree+0x49/0x180
> > do_new_mount+0x1a3/0x2d0
> > ? __pfx_do_new_mount+0x10/0x10
> > ? security_capable+0x39/0x70
> > path_mount+0x6dd/0x730
> > ? __pfx_path_mount+0x10/0x10
> > ? kmem_cache_free+0x1e5/0x270
> > ? user_path_at+0x48/0x60
> > do_mount+0x99/0xe0
> > ? __pfx_do_mount+0x10/0x10
> > ? lock_release+0x155/0x190
> > __do_sys_mount+0x141/0x180
> > do_syscall_64+0x9f/0x100
> > entry_SYSCALL_64_after_hwframe+0x76/0x7e
> > RIP: 0033:0x7f01b1b14f3e
> > Code: 48 8b 0d d5 3e 0f 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 49 89 ca b8 a5 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d a2 3e 0f 00 f7 d8 64 89 01 48
> > RSP: 002b:00007fffd129fa08 EFLAGS: 00000246 ORIG_RAX: 00000000000000a5
> > RAX: ffffffffffffffda RBX: 0000564ec01a7850 RCX: 00007f01b1b14f3e
> > RDX: 0000564ec00f2225 RSI: 00007fffd12a1964 RDI: 0000564ec0147a20
> > RBP: 00007fffd129fbd0 R08: 0000564ec014da90 R09: 0000000000000080
> > R10: 0000000000000000 R11: 0000000000000246 R12: 00007fffd12a194e
> > R13: 0000000000000000 R14: 00007fffd129fa50 R15: 00007fffd129fa40
> > </TASK>
> > 
> > Allocated by task 13305:
> > stack_trace_save+0x8c/0xc0
> > kasan_save_stack+0x1e/0x40
> > kasan_save_track+0x10/0x30
> > __kasan_kmalloc+0x3a/0x50
> > __kmalloc_noprof+0x247/0x290
> > ceph_osdmap_alloc+0x16/0x130
> > ceph_osdc_init+0x27a/0x4c0
> > ceph_create_client+0x153/0x190
> > create_fs_client+0x50/0x2a0
> > ceph_get_tree+0xff/0x680
> > vfs_get_tree+0x49/0x180
> > do_new_mount+0x1a3/0x2d0
> > path_mount+0x6dd/0x730
> > do_mount+0x99/0xe0
> > __do_sys_mount+0x141/0x180
> > do_syscall_64+0x9f/0x100
> > entry_SYSCALL_64_after_hwframe+0x76/0x7e
> > 
> > Freed by task 9475:
> > stack_trace_save+0x8c/0xc0
> > kasan_save_stack+0x1e/0x40
> > kasan_save_track+0x10/0x30
> > kasan_save_free_info+0x3b/0x50
> > __kasan_slab_free+0x18/0x30
> > kfree+0x212/0x290
> > handle_one_map+0x23c/0x3b0
> > ceph_osdc_handle_map+0x3c9/0x590
> > mon_dispatch+0x655/0x6f0
> > ceph_con_process_message+0xc3/0xe0
> > ceph_con_v1_try_read+0x614/0x760
> > ceph_con_workfn+0x2de/0x650
> > process_one_work+0x486/0x7c0
> > process_scheduled_works+0x73/0x90
> > worker_thread+0x1c8/0x2a0
> > kthread+0x2ec/0x300
> > ret_from_fork+0x24/0x40
> > ret_from_fork_asm+0x1a/0x30
> > 
> > The buggy address belongs to the object at ffff88811012d800
> > which belongs to the cache kmalloc-512 of size 512
> > The buggy address is located 16 bytes inside of
> > freed 512-byte region [ffff88811012d800, ffff88811012da00)
> > 
> > The buggy address belongs to the physical page:
> > page: refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x11012c
> > head: order:2 mapcount:0 entire_mapcount:0 nr_pages_mapped:0 pincount:0
> > flags: 0x200000000000040(head|node=0|zone=2)
> > page_type: f5(slab)
> > raw: 0200000000000040 ffff888100042c80 dead000000000100 dead000000000122
> > raw: 0000000000000000 0000000080100010 00000000f5000000 0000000000000000
> > head: 0200000000000040 ffff888100042c80 dead000000000100 dead000000000122
> > head: 0000000000000000 0000000080100010 00000000f5000000 0000000000000000
> > head: 0200000000000002 ffffea0004404b01 ffffffffffffffff 0000000000000000
> > head: 0000000000000004 0000000000000000 00000000ffffffff 0000000000000000
> > page dumped because: kasan: bad access detected
> > 
> > Memory state around the buggy address:
> > ffff88811012d700: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
> > ffff88811012d780: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
> > >ffff88811012d800: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
> >                          ^
> > ffff88811012d880: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
> > ffff88811012d900: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
> > ==================================================================
> > Disabling lock debugging due to kernel taint
> > libceph: client274326 fsid 8598140e-35c2-11ee-b97c-001517c545cc
> > libceph: mon0 (1)90.155.74.19:6789 session established
> > libceph: client274327 fsid 8598140e-35c2-11ee-b97c-001517c545cc
> > 
> > We have such scenario:
> > 
> > Thread 1:
> > void ceph_osdmap_destroy(...) {
> >     <skipped>
> >     kfree(map);
> > }
> > Thread 1 sleep...
> > 
> > Thread 2:
> > static bool have_mon_and_osd_map(struct ceph_client *client) {
> >     return client->monc.monmap && client->monc.monmap->epoch &&
> >         client->osdc.osdmap && client->osdc.osdmap->epoch;
> > }
> > Thread 2 has oops...
> > 
> > Thread 1 wake up:
> > static int handle_one_map(...) {
> >     <skipped>
> >     osdc->osdmap = newmap;
> >     <skipped>
> > }
> > 
> > This patch fixes the issue by taking client->osdc.lock and
> > client->monc.mutex before checking client->osdc.osdmap and
> > client->monc.monmap in have_mon_and_osd_map(). The patch also
> > adds locking in ceph_osdc_stop() around the destruction of
> > osdc->osdmap and the assignment of NULL to the pointer, and
> > takes the mutex in ceph_monc_stop() while freeing monc->monmap
> > and setting the pointer to NULL. The monmap_show() and
> > osdmap_show() methods are reworked to prevent a potential race
> > while they access the maps.
> > 
> > Reported-by: David Howells <dhowells@redhat.com>
> > Signed-off-by: Viacheslav Dubeyko <Slava.Dubeyko@ibm.com>
> > ---
> >  net/ceph/ceph_common.c | 14 ++++++++++++--
> >  net/ceph/debugfs.c     | 33 +++++++++++++++++++--------------
> >  net/ceph/mon_client.c  |  9 ++++++++-
> >  net/ceph/osd_client.c  |  4 ++++
> >  4 files changed, 43 insertions(+), 17 deletions(-)
> > 
> > diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
> > index 4c6441536d55..5c8fd78d6bd5 100644
> > --- a/net/ceph/ceph_common.c
> > +++ b/net/ceph/ceph_common.c
> > @@ -790,8 +790,18 @@ EXPORT_SYMBOL(ceph_reset_client_addr);
> >   */
> >  static bool have_mon_and_osd_map(struct ceph_client *client)
> >  {
> > -       return client->monc.monmap && client->monc.monmap->epoch &&
> > -              client->osdc.osdmap && client->osdc.osdmap->epoch;
> > +       bool have_mon_map = false;
> > +       bool have_osd_map = false;
> > +
> > +       mutex_lock(&client->monc.mutex);
> 
> Hi Slava,
> 
> This introduces a potential sleep into a function that is used as
> a wait_event condition in __ceph_open_session() which could result in
> a nested sleep.  This is frowned upon by the scheduler because the task
> state may get reset to TASK_RUNNING when not desired:
> 
>   do not call blocking ops when !TASK_RUNNING; state=1 set at
> [<ffffffff819c19fc>] prepare_to_wait_event+0x3ac/0x460
> kernel/sched/wait.c:298
> 

Yeah, I missed the call to have_mon_and_osd_map() here:

		err = wait_event_interruptible_timeout(client->auth_wq,
			have_mon_and_osd_map(client) || (client->auth_err < 0),
			ceph_timeout_jiffies(timeout));


> You would probably need to open-code waiting in __ceph_open_session()
> per [1].
> 

Maybe we need to rework the waiting approach here? Could we check some
condition that doesn't require any locking at all? Maybe we can use
some atomic_t variable that summarizes the state?
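
Purely as an illustration of that idea (using a bit in an unsigned long
rather than an atomic_t, but the principle is the same): a hypothetical
have_maps field in struct ceph_client could be set by the writers under
their existing locks and then checked locklessly in the wait condition.
None of the names below exist in the current code:

	#define HAVE_MONMAP	0
	#define HAVE_OSDMAP	1

	/* called e.g. from handle_one_map() after installing a new map */
	static void note_osdmap(struct ceph_client *client)
	{
		set_bit(HAVE_OSDMAP, &client->have_maps);  /* hypothetical field */
	}

	/* lock-free, so safe to use as a wait_event() condition */
	static bool have_mon_and_osd_map(struct ceph_client *client)
	{
		return test_bit(HAVE_MONMAP, &client->have_maps) &&
		       test_bit(HAVE_OSDMAP, &client->have_maps);
	}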

> > +       have_mon_map = client->monc.monmap && client->monc.monmap->epoch;
> > +       mutex_unlock(&client->monc.mutex);
> > +
> > +       down_read(&client->osdc.lock);
> > +       have_osd_map = client->osdc.osdmap && client->osdc.osdmap->epoch;
> > +       up_read(&client->osdc.lock);
> > +
> > +       return have_mon_map && have_osd_map;
> >  }
> > 
> >  /*
> > diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
> > index 2110439f8a24..6e2014c813ca 100644
> > --- a/net/ceph/debugfs.c
> > +++ b/net/ceph/debugfs.c
> > @@ -36,18 +36,20 @@ static int monmap_show(struct seq_file *s, void *p)
> >         int i;
> >         struct ceph_client *client = s->private;
> > 
> > -       if (client->monc.monmap == NULL)
> > -               return 0;
> > -
> > -       seq_printf(s, "epoch %d\n", client->monc.monmap->epoch);
> > -       for (i = 0; i < client->monc.monmap->num_mon; i++) {
> > -               struct ceph_entity_inst *inst =
> > -                       &client->monc.monmap->mon_inst[i];
> > -
> > -               seq_printf(s, "\t%s%lld\t%s\n",
> > -                          ENTITY_NAME(inst->name),
> > -                          ceph_pr_addr(&inst->addr));
> > +       mutex_lock(&client->monc.mutex);
> > +       if (client->monc.monmap) {
> > +               seq_printf(s, "epoch %d\n", client->monc.monmap->epoch);
> > +               for (i = 0; i < client->monc.monmap->num_mon; i++) {
> > +                       struct ceph_entity_inst *inst =
> > +                               &client->monc.monmap->mon_inst[i];
> > +
> > +                       seq_printf(s, "\t%s%lld\t%s\n",
> > +                                  ENTITY_NAME(inst->name),
> > +                                  ceph_pr_addr(&inst->addr));
> > +               }
> >         }
> > +       mutex_unlock(&client->monc.mutex);
> 
> Nit: I'd prefer
> 
>         mutex_lock(&client->monc.mutex);
>         if (client->monc.monmap == NULL)
>                 goto out_unlock;
> 
>         ...
> 
> out_unlock:
>         mutex_unlock(&client->monc.mutex);
>         return 0;
> 
> to an additional level of indentation here.
> 

OK, I can do this.

> > +
> >         return 0;
> >  }
> > 
> > @@ -56,13 +58,15 @@ static int osdmap_show(struct seq_file *s, void *p)
> >         int i;
> >         struct ceph_client *client = s->private;
> >         struct ceph_osd_client *osdc = &client->osdc;
> > -       struct ceph_osdmap *map = osdc->osdmap;
> > +       struct ceph_osdmap *map = NULL;
> >         struct rb_node *n;
> > 
> > +       down_read(&osdc->lock);
> > +
> > +       map = osdc->osdmap;
> >         if (map == NULL)
> > -               return 0;
> > +               goto finish_osdmap_show;
> > 
> > -       down_read(&osdc->lock);
> >         seq_printf(s, "epoch %u barrier %u flags 0x%x\n", map->epoch,
> >                         osdc->epoch_barrier, map->flags);
> > 
> > @@ -131,6 +135,7 @@ static int osdmap_show(struct seq_file *s, void *p)
> >                 seq_printf(s, "]\n");
> >         }
> > 
> > +finish_osdmap_show:
> 
> Nit: I'd rename this label to out_unlock so that the name communicates what
> the label is for.
> 

OK, sounds good.

> >         up_read(&osdc->lock);
> >         return 0;
> >  }
> > diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
> > index ab66b599ac47..b299e5bbddb1 100644
> > --- a/net/ceph/mon_client.c
> > +++ b/net/ceph/mon_client.c
> > @@ -1232,6 +1232,7 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
> >         ceph_auth_destroy(monc->auth);
> >  out_monmap:
> >         kfree(monc->monmap);
> > +       monc->monmap = NULL;
> >  out:
> >         return err;
> >  }
> > @@ -1239,6 +1240,8 @@ EXPORT_SYMBOL(ceph_monc_init);
> > 
> >  void ceph_monc_stop(struct ceph_mon_client *monc)
> >  {
> > +       struct ceph_monmap *old_monmap;
> > +
> >         dout("stop\n");
> > 
> >         mutex_lock(&monc->mutex);
> > @@ -1266,7 +1269,11 @@ void ceph_monc_stop(struct ceph_mon_client *monc)
> >         ceph_msg_put(monc->m_subscribe);
> >         ceph_msg_put(monc->m_subscribe_ack);
> > 
> > -       kfree(monc->monmap);
> > +       mutex_lock(&monc->mutex);
> 
> Did you have a specific reason for adding locking here and in
> ceph_osdc_stop()?  I see that David inquired about this in a comment
> on v2 and I think the answer to his question is no.
> 

I assume that you mean this:

>>  Is
>> that a "just in case" thing?  It doesn't look like the client can get
>> resurrected afterwards, but I may have missed something.  If it's not just in
>> case, does the access and clearance of the pointers need wrapping in the
>> appropriate lock?
>>
>
>Let me double check it. Probably, you are right here.

My understanding of this comment was that we need to be more careful with
pointer access and that locking needs to be added during destruction. So, I
promised to double-check it and shared v3 of the patch. That was my view of
the world at that moment. :)

>   By this point we
> are long past the point of no return and no new messages can arrive for
> e.g. handle_one_map() to be called.
> 

My idea was simple. Some threads can potentially try to open a session during
or after the ceph_monc_stop() and ceph_osdc_stop() calls. Do you mean that we
can guarantee such a situation will never happen?

Thanks,
Slava.
diff mbox series

Patch

diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index 4c6441536d55..5c8fd78d6bd5 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -790,8 +790,18 @@  EXPORT_SYMBOL(ceph_reset_client_addr);
  */
 static bool have_mon_and_osd_map(struct ceph_client *client)
 {
-	return client->monc.monmap && client->monc.monmap->epoch &&
-	       client->osdc.osdmap && client->osdc.osdmap->epoch;
+	bool have_mon_map = false;
+	bool have_osd_map = false;
+
+	mutex_lock(&client->monc.mutex);
+	have_mon_map = client->monc.monmap && client->monc.monmap->epoch;
+	mutex_unlock(&client->monc.mutex);
+
+	down_read(&client->osdc.lock);
+	have_osd_map = client->osdc.osdmap && client->osdc.osdmap->epoch;
+	up_read(&client->osdc.lock);
+
+	return have_mon_map && have_osd_map;
 }
 
 /*
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index 2110439f8a24..6e2014c813ca 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -36,18 +36,20 @@  static int monmap_show(struct seq_file *s, void *p)
 	int i;
 	struct ceph_client *client = s->private;
 
-	if (client->monc.monmap == NULL)
-		return 0;
-
-	seq_printf(s, "epoch %d\n", client->monc.monmap->epoch);
-	for (i = 0; i < client->monc.monmap->num_mon; i++) {
-		struct ceph_entity_inst *inst =
-			&client->monc.monmap->mon_inst[i];
-
-		seq_printf(s, "\t%s%lld\t%s\n",
-			   ENTITY_NAME(inst->name),
-			   ceph_pr_addr(&inst->addr));
+	mutex_lock(&client->monc.mutex);
+	if (client->monc.monmap) {
+		seq_printf(s, "epoch %d\n", client->monc.monmap->epoch);
+		for (i = 0; i < client->monc.monmap->num_mon; i++) {
+			struct ceph_entity_inst *inst =
+				&client->monc.monmap->mon_inst[i];
+
+			seq_printf(s, "\t%s%lld\t%s\n",
+				   ENTITY_NAME(inst->name),
+				   ceph_pr_addr(&inst->addr));
+		}
 	}
+	mutex_unlock(&client->monc.mutex);
+
 	return 0;
 }
 
@@ -56,13 +58,15 @@  static int osdmap_show(struct seq_file *s, void *p)
 	int i;
 	struct ceph_client *client = s->private;
 	struct ceph_osd_client *osdc = &client->osdc;
-	struct ceph_osdmap *map = osdc->osdmap;
+	struct ceph_osdmap *map = NULL;
 	struct rb_node *n;
 
+	down_read(&osdc->lock);
+
+	map = osdc->osdmap;
 	if (map == NULL)
-		return 0;
+		goto finish_osdmap_show;
 
-	down_read(&osdc->lock);
 	seq_printf(s, "epoch %u barrier %u flags 0x%x\n", map->epoch,
 			osdc->epoch_barrier, map->flags);
 
@@ -131,6 +135,7 @@  static int osdmap_show(struct seq_file *s, void *p)
 		seq_printf(s, "]\n");
 	}
 
+finish_osdmap_show:
 	up_read(&osdc->lock);
 	return 0;
 }
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index ab66b599ac47..b299e5bbddb1 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -1232,6 +1232,7 @@  int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
 	ceph_auth_destroy(monc->auth);
 out_monmap:
 	kfree(monc->monmap);
+	monc->monmap = NULL;
 out:
 	return err;
 }
@@ -1239,6 +1240,8 @@  EXPORT_SYMBOL(ceph_monc_init);
 
 void ceph_monc_stop(struct ceph_mon_client *monc)
 {
+	struct ceph_monmap *old_monmap;
+
 	dout("stop\n");
 
 	mutex_lock(&monc->mutex);
@@ -1266,7 +1269,11 @@  void ceph_monc_stop(struct ceph_mon_client *monc)
 	ceph_msg_put(monc->m_subscribe);
 	ceph_msg_put(monc->m_subscribe_ack);
 
-	kfree(monc->monmap);
+	mutex_lock(&monc->mutex);
+	old_monmap = monc->monmap;
+	monc->monmap = NULL;
+	mutex_unlock(&monc->mutex);
+	kfree(old_monmap);
 }
 EXPORT_SYMBOL(ceph_monc_stop);
 
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index b24afec24138..762f4df5763b 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -5278,6 +5278,7 @@  int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
 	mempool_destroy(osdc->req_mempool);
 out_map:
 	ceph_osdmap_destroy(osdc->osdmap);
+	osdc->osdmap = NULL;
 out:
 	return err;
 }
@@ -5306,10 +5307,13 @@  void ceph_osdc_stop(struct ceph_osd_client *osdc)
 	WARN_ON(atomic_read(&osdc->num_requests));
 	WARN_ON(atomic_read(&osdc->num_homeless));
 
+	down_write(&osdc->lock);
 	ceph_osdmap_destroy(osdc->osdmap);
+	osdc->osdmap = NULL;
 	mempool_destroy(osdc->req_mempool);
 	ceph_msgpool_destroy(&osdc->msgpool_op);
 	ceph_msgpool_destroy(&osdc->msgpool_op_reply);
+	up_write(&osdc->lock);
 }
 
 int osd_req_op_copy_from_init(struct ceph_osd_request *req,