Message ID | 20250305194437.59309-1-slava@dubeyko.com |
---|---|
State | New |
Headers | show |
Series | [v4] ceph: fix slab-use-after-free in have_mon_and_osd_map() | expand |
Viacheslav Dubeyko <slava@dubeyko.com> wrote: > The generic/395 and generic/397 is capable of generating > the oops is on line net/ceph/ceph_common.c:794 with > KASAN enabled. > > BUG: KASAN: slab-use-after-free in have_mon_and_osd_map+0x56/0x70 > Read of size 4 at addr ffff88811012d810 by task mount.ceph/13305 > ... > This patch fixes the issue by means of locking > client->osdc.lock and client->monc.mutex before > the checking client->osdc.osdmap and > client->monc.monmap in have_mon_and_osd_map() function. > Patch adds locking in the ceph_osdc_stop() > method during the destructruction of osdc->osdmap and > assigning of NULL to the pointer. The lock is used > in the ceph_monc_stop() during the freeing of monc->monmap > and assigning NULL to the pointer too. The monmap_show() > and osdmap_show() methods were reworked to prevent > the potential race condition during the methods call. > > Reported-by: David Howells <dhowells@redhat.com> > Signed-off-by: Viacheslav Dubeyko <Slava.Dubeyko@ibm.com> Reviewed-by: David Howells <dhowells@redhat.com>
On Wed, Mar 5, 2025 at 8:44 PM Viacheslav Dubeyko <slava@dubeyko.com> wrote: > > From: Viacheslav Dubeyko <Slava.Dubeyko@ibm.com> > > The generic/395 and generic/397 is capable of generating > the oops is on line net/ceph/ceph_common.c:794 with > KASAN enabled. > > BUG: KASAN: slab-use-after-free in have_mon_and_osd_map+0x56/0x70 > Read of size 4 at addr ffff88811012d810 by task mount.ceph/13305 > > CPU: 2 UID: 0 PID: 13305 Comm: mount.ceph Not tainted 6.14.0-rc2-build2+ #1266 > Hardware name: ASUS All Series/H97-PLUS, BIOS 2306 10/09/2014 > Call Trace: > <TASK> > dump_stack_lvl+0x57/0x80 > ? have_mon_and_osd_map+0x56/0x70 > print_address_description.constprop.0+0x84/0x330 > ? have_mon_and_osd_map+0x56/0x70 > print_report+0xe2/0x1e0 > ? rcu_read_unlock_sched+0x60/0x80 > ? kmem_cache_debug_flags+0xc/0x20 > ? fixup_red_left+0x17/0x30 > ? have_mon_and_osd_map+0x56/0x70 > kasan_report+0x8d/0xc0 > ? have_mon_and_osd_map+0x56/0x70 > have_mon_and_osd_map+0x56/0x70 > ceph_open_session+0x182/0x290 > ? __pfx_ceph_open_session+0x10/0x10 > ? __init_swait_queue_head+0x8d/0xa0 > ? __pfx_autoremove_wake_function+0x10/0x10 > ? shrinker_register+0xdd/0xf0 > ceph_get_tree+0x333/0x680 > vfs_get_tree+0x49/0x180 > do_new_mount+0x1a3/0x2d0 > ? __pfx_do_new_mount+0x10/0x10 > ? security_capable+0x39/0x70 > path_mount+0x6dd/0x730 > ? __pfx_path_mount+0x10/0x10 > ? kmem_cache_free+0x1e5/0x270 > ? user_path_at+0x48/0x60 > do_mount+0x99/0xe0 > ? __pfx_do_mount+0x10/0x10 > ? lock_release+0x155/0x190 > __do_sys_mount+0x141/0x180 > do_syscall_64+0x9f/0x100 > entry_SYSCALL_64_after_hwframe+0x76/0x7e > RIP: 0033:0x7f01b1b14f3e > Code: 48 8b 0d d5 3e 0f 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 49 89 ca b8 a5 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d a2 3e 0f 00 f7 d8 64 89 01 48 > RSP: 002b:00007fffd129fa08 EFLAGS: 00000246 ORIG_RAX: 00000000000000a5 > RAX: ffffffffffffffda RBX: 0000564ec01a7850 RCX: 00007f01b1b14f3e > RDX: 0000564ec00f2225 RSI: 00007fffd12a1964 RDI: 0000564ec0147a20 > RBP: 00007fffd129fbd0 R08: 0000564ec014da90 R09: 0000000000000080 > R10: 0000000000000000 R11: 0000000000000246 R12: 00007fffd12a194e > R13: 0000000000000000 R14: 00007fffd129fa50 R15: 00007fffd129fa40 > </TASK> > > Allocated by task 13305: > stack_trace_save+0x8c/0xc0 > kasan_save_stack+0x1e/0x40 > kasan_save_track+0x10/0x30 > __kasan_kmalloc+0x3a/0x50 > __kmalloc_noprof+0x247/0x290 > ceph_osdmap_alloc+0x16/0x130 > ceph_osdc_init+0x27a/0x4c0 > ceph_create_client+0x153/0x190 > create_fs_client+0x50/0x2a0 > ceph_get_tree+0xff/0x680 > vfs_get_tree+0x49/0x180 > do_new_mount+0x1a3/0x2d0 > path_mount+0x6dd/0x730 > do_mount+0x99/0xe0 > __do_sys_mount+0x141/0x180 > do_syscall_64+0x9f/0x100 > entry_SYSCALL_64_after_hwframe+0x76/0x7e > > Freed by task 9475: > stack_trace_save+0x8c/0xc0 > kasan_save_stack+0x1e/0x40 > kasan_save_track+0x10/0x30 > kasan_save_free_info+0x3b/0x50 > __kasan_slab_free+0x18/0x30 > kfree+0x212/0x290 > handle_one_map+0x23c/0x3b0 > ceph_osdc_handle_map+0x3c9/0x590 > mon_dispatch+0x655/0x6f0 > ceph_con_process_message+0xc3/0xe0 > ceph_con_v1_try_read+0x614/0x760 > ceph_con_workfn+0x2de/0x650 > process_one_work+0x486/0x7c0 > process_scheduled_works+0x73/0x90 > worker_thread+0x1c8/0x2a0 > kthread+0x2ec/0x300 > ret_from_fork+0x24/0x40 > ret_from_fork_asm+0x1a/0x30 > > The buggy address belongs to the object at ffff88811012d800 > which belongs to the cache kmalloc-512 of size 512 > The buggy address is located 16 bytes inside of > freed 512-byte region [ffff88811012d800, ffff88811012da00) > > The buggy address belongs to the physical page: > page: refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x11012c > head: order:2 mapcount:0 entire_mapcount:0 nr_pages_mapped:0 pincount:0 > flags: 0x200000000000040(head|node=0|zone=2) > page_type: f5(slab) > raw: 0200000000000040 ffff888100042c80 dead000000000100 dead000000000122 > raw: 0000000000000000 0000000080100010 00000000f5000000 0000000000000000 > head: 0200000000000040 ffff888100042c80 dead000000000100 dead000000000122 > head: 0000000000000000 0000000080100010 00000000f5000000 0000000000000000 > head: 0200000000000002 ffffea0004404b01 ffffffffffffffff 0000000000000000 > head: 0000000000000004 0000000000000000 00000000ffffffff 0000000000000000 > page dumped because: kasan: bad access detected > > Memory state around the buggy address: > ffff88811012d700: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc > ffff88811012d780: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc > > ffff88811012d800: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb > > ^ > ffff88811012d880: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb > ffff88811012d900: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ================================================================== > Disabling lock debugging due to kernel taint > libceph: client274326 fsid 8598140e-35c2-11ee-b97c-001517c545cc > libceph: mon0 (1)90.155.74.19:6789 session established > libceph: client274327 fsid 8598140e-35c2-11ee-b97c-001517c545cc > > We have such scenario: > > Thread 1: > void ceph_osdmap_destroy(...) { > <skipped> > kfree(map); > } > Thread 1 sleep... > > Thread 2: > static bool have_mon_and_osd_map(struct ceph_client *client) { > return client->monc.monmap && client->monc.monmap->epoch && > client->osdc.osdmap && client->osdc.osdmap->epoch; > } > Thread 2 has oops... > > Thread 1 wake up: > static int handle_one_map(...) { > <skipped> > osdc->osdmap = newmap; > <skipped> > } > > This patch fixes the issue by means of locking > client->osdc.lock and client->monc.mutex before > the checking client->osdc.osdmap and > client->monc.monmap in have_mon_and_osd_map() function. > Patch adds locking in the ceph_osdc_stop() > method during the destructruction of osdc->osdmap and > assigning of NULL to the pointer. The lock is used > in the ceph_monc_stop() during the freeing of monc->monmap > and assigning NULL to the pointer too. The monmap_show() > and osdmap_show() methods were reworked to prevent > the potential race condition during the methods call. > > Reported-by: David Howells <dhowells@redhat.com> > Signed-off-by: Viacheslav Dubeyko <Slava.Dubeyko@ibm.com> > --- > net/ceph/ceph_common.c | 14 ++++++++++++-- > net/ceph/debugfs.c | 33 +++++++++++++++++++-------------- > net/ceph/mon_client.c | 9 ++++++++- > net/ceph/osd_client.c | 4 ++++ > 4 files changed, 43 insertions(+), 17 deletions(-) > > diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c > index 4c6441536d55..5c8fd78d6bd5 100644 > --- a/net/ceph/ceph_common.c > +++ b/net/ceph/ceph_common.c > @@ -790,8 +790,18 @@ EXPORT_SYMBOL(ceph_reset_client_addr); > */ > static bool have_mon_and_osd_map(struct ceph_client *client) > { > - return client->monc.monmap && client->monc.monmap->epoch && > - client->osdc.osdmap && client->osdc.osdmap->epoch; > + bool have_mon_map = false; > + bool have_osd_map = false; > + > + mutex_lock(&client->monc.mutex); Hi Slava, This introduces a potential sleep into a function that is used as a wait_event condition in __ceph_open_session() which could result in a nested sleep. This is frowned up by the scheduler because the task state may get reset to TASK_RUNNING when not desired: do not call blocking ops when !TASK_RUNNING; state=1 set at [<ffffffff819c19fc>] prepare_to_wait_event+0x3ac/0x460 kernel/sched/wait.c:298 You would probably need to open-code waiting in __ceph_open_session() per [1]. > + have_mon_map = client->monc.monmap && client->monc.monmap->epoch; > + mutex_unlock(&client->monc.mutex); > + > + down_read(&client->osdc.lock); > + have_osd_map = client->osdc.osdmap && client->osdc.osdmap->epoch; > + up_read(&client->osdc.lock); > + > + return have_mon_map && have_osd_map; > } > > /* > diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c > index 2110439f8a24..6e2014c813ca 100644 > --- a/net/ceph/debugfs.c > +++ b/net/ceph/debugfs.c > @@ -36,18 +36,20 @@ static int monmap_show(struct seq_file *s, void *p) > int i; > struct ceph_client *client = s->private; > > - if (client->monc.monmap == NULL) > - return 0; > - > - seq_printf(s, "epoch %d\n", client->monc.monmap->epoch); > - for (i = 0; i < client->monc.monmap->num_mon; i++) { > - struct ceph_entity_inst *inst = > - &client->monc.monmap->mon_inst[i]; > - > - seq_printf(s, "\t%s%lld\t%s\n", > - ENTITY_NAME(inst->name), > - ceph_pr_addr(&inst->addr)); > + mutex_lock(&client->monc.mutex); > + if (client->monc.monmap) { > + seq_printf(s, "epoch %d\n", client->monc.monmap->epoch); > + for (i = 0; i < client->monc.monmap->num_mon; i++) { > + struct ceph_entity_inst *inst = > + &client->monc.monmap->mon_inst[i]; > + > + seq_printf(s, "\t%s%lld\t%s\n", > + ENTITY_NAME(inst->name), > + ceph_pr_addr(&inst->addr)); > + } > } > + mutex_unlock(&client->monc.mutex); Nit: I'd prefer mutex_lock(&client->monc.mutex); if (client->monc.monmap == NULL) goto out_unlock; ... out_unlock: mutex_unlock(&client->monc.mutex); return 0; to an additional level of indentation here. > + > return 0; > } > > @@ -56,13 +58,15 @@ static int osdmap_show(struct seq_file *s, void *p) > int i; > struct ceph_client *client = s->private; > struct ceph_osd_client *osdc = &client->osdc; > - struct ceph_osdmap *map = osdc->osdmap; > + struct ceph_osdmap *map = NULL; > struct rb_node *n; > > + down_read(&osdc->lock); > + > + map = osdc->osdmap; > if (map == NULL) > - return 0; > + goto finish_osdmap_show; > > - down_read(&osdc->lock); > seq_printf(s, "epoch %u barrier %u flags 0x%x\n", map->epoch, > osdc->epoch_barrier, map->flags); > > @@ -131,6 +135,7 @@ static int osdmap_show(struct seq_file *s, void *p) > seq_printf(s, "]\n"); > } > > +finish_osdmap_show: Nit: I'd rename this label to out_unlock so that name communicates what the label is for. > up_read(&osdc->lock); > return 0; > } > diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c > index ab66b599ac47..b299e5bbddb1 100644 > --- a/net/ceph/mon_client.c > +++ b/net/ceph/mon_client.c > @@ -1232,6 +1232,7 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) > ceph_auth_destroy(monc->auth); > out_monmap: > kfree(monc->monmap); > + monc->monmap = NULL; > out: > return err; > } > @@ -1239,6 +1240,8 @@ EXPORT_SYMBOL(ceph_monc_init); > > void ceph_monc_stop(struct ceph_mon_client *monc) > { > + struct ceph_monmap *old_monmap; > + > dout("stop\n"); > > mutex_lock(&monc->mutex); > @@ -1266,7 +1269,11 @@ void ceph_monc_stop(struct ceph_mon_client *monc) > ceph_msg_put(monc->m_subscribe); > ceph_msg_put(monc->m_subscribe_ack); > > - kfree(monc->monmap); > + mutex_lock(&monc->mutex); Did you have a specific reason for adding locking here and in ceph_osdc_stop()? I see that David inquired about this in a comment on v2 and I think the answer to his question is no. By this point we are long past the point of no return and no new messages can arrive for e.g. handle_one_map() to be called. [1] https://lwn.net/Articles/628628/ Thanks, Ilya
On Tue, 2025-04-08 at 17:20 +0200, Ilya Dryomov wrote: > On Wed, Mar 5, 2025 at 8:44 PM Viacheslav Dubeyko <slava@dubeyko.com> wrote: > > > > From: Viacheslav Dubeyko <Slava.Dubeyko@ibm.com> > > > > The generic/395 and generic/397 is capable of generating > > the oops is on line net/ceph/ceph_common.c:794 with > > KASAN enabled. > > > > BUG: KASAN: slab-use-after-free in have_mon_and_osd_map+0x56/0x70 > > Read of size 4 at addr ffff88811012d810 by task mount.ceph/13305 > > > > CPU: 2 UID: 0 PID: 13305 Comm: mount.ceph Not tainted 6.14.0-rc2-build2+ #1266 > > Hardware name: ASUS All Series/H97-PLUS, BIOS 2306 10/09/2014 > > Call Trace: > > <TASK> > > dump_stack_lvl+0x57/0x80 > > ? have_mon_and_osd_map+0x56/0x70 > > print_address_description.constprop.0+0x84/0x330 > > ? have_mon_and_osd_map+0x56/0x70 > > print_report+0xe2/0x1e0 > > ? rcu_read_unlock_sched+0x60/0x80 > > ? kmem_cache_debug_flags+0xc/0x20 > > ? fixup_red_left+0x17/0x30 > > ? have_mon_and_osd_map+0x56/0x70 > > kasan_report+0x8d/0xc0 > > ? have_mon_and_osd_map+0x56/0x70 > > have_mon_and_osd_map+0x56/0x70 > > ceph_open_session+0x182/0x290 > > ? __pfx_ceph_open_session+0x10/0x10 > > ? __init_swait_queue_head+0x8d/0xa0 > > ? __pfx_autoremove_wake_function+0x10/0x10 > > ? shrinker_register+0xdd/0xf0 > > ceph_get_tree+0x333/0x680 > > vfs_get_tree+0x49/0x180 > > do_new_mount+0x1a3/0x2d0 > > ? __pfx_do_new_mount+0x10/0x10 > > ? security_capable+0x39/0x70 > > path_mount+0x6dd/0x730 > > ? __pfx_path_mount+0x10/0x10 > > ? kmem_cache_free+0x1e5/0x270 > > ? user_path_at+0x48/0x60 > > do_mount+0x99/0xe0 > > ? __pfx_do_mount+0x10/0x10 > > ? lock_release+0x155/0x190 > > __do_sys_mount+0x141/0x180 > > do_syscall_64+0x9f/0x100 > > entry_SYSCALL_64_after_hwframe+0x76/0x7e > > RIP: 0033:0x7f01b1b14f3e > > Code: 48 8b 0d d5 3e 0f 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 49 89 ca b8 a5 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d a2 3e 0f 00 f7 d8 64 89 01 48 > > RSP: 002b:00007fffd129fa08 EFLAGS: 00000246 ORIG_RAX: 00000000000000a5 > > RAX: ffffffffffffffda RBX: 0000564ec01a7850 RCX: 00007f01b1b14f3e > > RDX: 0000564ec00f2225 RSI: 00007fffd12a1964 RDI: 0000564ec0147a20 > > RBP: 00007fffd129fbd0 R08: 0000564ec014da90 R09: 0000000000000080 > > R10: 0000000000000000 R11: 0000000000000246 R12: 00007fffd12a194e > > R13: 0000000000000000 R14: 00007fffd129fa50 R15: 00007fffd129fa40 > > </TASK> > > > > Allocated by task 13305: > > stack_trace_save+0x8c/0xc0 > > kasan_save_stack+0x1e/0x40 > > kasan_save_track+0x10/0x30 > > __kasan_kmalloc+0x3a/0x50 > > __kmalloc_noprof+0x247/0x290 > > ceph_osdmap_alloc+0x16/0x130 > > ceph_osdc_init+0x27a/0x4c0 > > ceph_create_client+0x153/0x190 > > create_fs_client+0x50/0x2a0 > > ceph_get_tree+0xff/0x680 > > vfs_get_tree+0x49/0x180 > > do_new_mount+0x1a3/0x2d0 > > path_mount+0x6dd/0x730 > > do_mount+0x99/0xe0 > > __do_sys_mount+0x141/0x180 > > do_syscall_64+0x9f/0x100 > > entry_SYSCALL_64_after_hwframe+0x76/0x7e > > > > Freed by task 9475: > > stack_trace_save+0x8c/0xc0 > > kasan_save_stack+0x1e/0x40 > > kasan_save_track+0x10/0x30 > > kasan_save_free_info+0x3b/0x50 > > __kasan_slab_free+0x18/0x30 > > kfree+0x212/0x290 > > handle_one_map+0x23c/0x3b0 > > ceph_osdc_handle_map+0x3c9/0x590 > > mon_dispatch+0x655/0x6f0 > > ceph_con_process_message+0xc3/0xe0 > > ceph_con_v1_try_read+0x614/0x760 > > ceph_con_workfn+0x2de/0x650 > > process_one_work+0x486/0x7c0 > > process_scheduled_works+0x73/0x90 > > worker_thread+0x1c8/0x2a0 > > kthread+0x2ec/0x300 > > ret_from_fork+0x24/0x40 > > ret_from_fork_asm+0x1a/0x30 > > > > The buggy address belongs to the object at ffff88811012d800 > > which belongs to the cache kmalloc-512 of size 512 > > The buggy address is located 16 bytes inside of > > freed 512-byte region [ffff88811012d800, ffff88811012da00) > > > > The buggy address belongs to the physical page: > > page: refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x11012c > > head: order:2 mapcount:0 entire_mapcount:0 nr_pages_mapped:0 pincount:0 > > flags: 0x200000000000040(head|node=0|zone=2) > > page_type: f5(slab) > > raw: 0200000000000040 ffff888100042c80 dead000000000100 dead000000000122 > > raw: 0000000000000000 0000000080100010 00000000f5000000 0000000000000000 > > head: 0200000000000040 ffff888100042c80 dead000000000100 dead000000000122 > > head: 0000000000000000 0000000080100010 00000000f5000000 0000000000000000 > > head: 0200000000000002 ffffea0004404b01 ffffffffffffffff 0000000000000000 > > head: 0000000000000004 0000000000000000 00000000ffffffff 0000000000000000 > > page dumped because: kasan: bad access detected > > > > Memory state around the buggy address: > > ffff88811012d700: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc > > ffff88811012d780: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc > > > > ffff88811012d800: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb > > > > ^ > > ffff88811012d880: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb > > ffff88811012d900: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ================================================================== > > Disabling lock debugging due to kernel taint > > libceph: client274326 fsid 8598140e-35c2-11ee-b97c-001517c545cc > > libceph: mon0 (1)90.155.74.19:6789 session established > > libceph: client274327 fsid 8598140e-35c2-11ee-b97c-001517c545cc > > > > We have such scenario: > > > > Thread 1: > > void ceph_osdmap_destroy(...) { > > <skipped> > > kfree(map); > > } > > Thread 1 sleep... > > > > Thread 2: > > static bool have_mon_and_osd_map(struct ceph_client *client) { > > return client->monc.monmap && client->monc.monmap->epoch && > > client->osdc.osdmap && client->osdc.osdmap->epoch; > > } > > Thread 2 has oops... > > > > Thread 1 wake up: > > static int handle_one_map(...) { > > <skipped> > > osdc->osdmap = newmap; > > <skipped> > > } > > > > This patch fixes the issue by means of locking > > client->osdc.lock and client->monc.mutex before > > the checking client->osdc.osdmap and > > client->monc.monmap in have_mon_and_osd_map() function. > > Patch adds locking in the ceph_osdc_stop() > > method during the destructruction of osdc->osdmap and > > assigning of NULL to the pointer. The lock is used > > in the ceph_monc_stop() during the freeing of monc->monmap > > and assigning NULL to the pointer too. The monmap_show() > > and osdmap_show() methods were reworked to prevent > > the potential race condition during the methods call. > > > > Reported-by: David Howells <dhowells@redhat.com> > > Signed-off-by: Viacheslav Dubeyko <Slava.Dubeyko@ibm.com> > > --- > > net/ceph/ceph_common.c | 14 ++++++++++++-- > > net/ceph/debugfs.c | 33 +++++++++++++++++++-------------- > > net/ceph/mon_client.c | 9 ++++++++- > > net/ceph/osd_client.c | 4 ++++ > > 4 files changed, 43 insertions(+), 17 deletions(-) > > > > diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c > > index 4c6441536d55..5c8fd78d6bd5 100644 > > --- a/net/ceph/ceph_common.c > > +++ b/net/ceph/ceph_common.c > > @@ -790,8 +790,18 @@ EXPORT_SYMBOL(ceph_reset_client_addr); > > */ > > static bool have_mon_and_osd_map(struct ceph_client *client) > > { > > - return client->monc.monmap && client->monc.monmap->epoch && > > - client->osdc.osdmap && client->osdc.osdmap->epoch; > > + bool have_mon_map = false; > > + bool have_osd_map = false; > > + > > + mutex_lock(&client->monc.mutex); > > Hi Slava, > > This introduces a potential sleep into a function that is used as > a wait_event condition in __ceph_open_session() which could result in > a nested sleep. This is frowned up by the scheduler because the task > state may get reset to TASK_RUNNING when not desired: > > do not call blocking ops when !TASK_RUNNING; state=1 set at > [<ffffffff819c19fc>] prepare_to_wait_event+0x3ac/0x460 > kernel/sched/wait.c:298 > Yeah, I missed the call of have_mon_and_osd_map() here: err = wait_event_interruptible_timeout(client->auth_wq, have_mon_and_osd_map(client) || (client->auth_err < 0), ceph_timeout_jiffies(timeout)); > You would probably need to open-code waiting in __ceph_open_session() > per [1]. > Maybe, we need to rework the waiting approach here? Could we check some condition that it doesn't require to use the locking at all? Maybe, we can use some atomic_t variable that will summarize the state? > > + have_mon_map = client->monc.monmap && client->monc.monmap->epoch; > > + mutex_unlock(&client->monc.mutex); > > + > > + down_read(&client->osdc.lock); > > + have_osd_map = client->osdc.osdmap && client->osdc.osdmap->epoch; > > + up_read(&client->osdc.lock); > > + > > + return have_mon_map && have_osd_map; > > } > > > > /* > > diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c > > index 2110439f8a24..6e2014c813ca 100644 > > --- a/net/ceph/debugfs.c > > +++ b/net/ceph/debugfs.c > > @@ -36,18 +36,20 @@ static int monmap_show(struct seq_file *s, void *p) > > int i; > > struct ceph_client *client = s->private; > > > > - if (client->monc.monmap == NULL) > > - return 0; > > - > > - seq_printf(s, "epoch %d\n", client->monc.monmap->epoch); > > - for (i = 0; i < client->monc.monmap->num_mon; i++) { > > - struct ceph_entity_inst *inst = > > - &client->monc.monmap->mon_inst[i]; > > - > > - seq_printf(s, "\t%s%lld\t%s\n", > > - ENTITY_NAME(inst->name), > > - ceph_pr_addr(&inst->addr)); > > + mutex_lock(&client->monc.mutex); > > + if (client->monc.monmap) { > > + seq_printf(s, "epoch %d\n", client->monc.monmap->epoch); > > + for (i = 0; i < client->monc.monmap->num_mon; i++) { > > + struct ceph_entity_inst *inst = > > + &client->monc.monmap->mon_inst[i]; > > + > > + seq_printf(s, "\t%s%lld\t%s\n", > > + ENTITY_NAME(inst->name), > > + ceph_pr_addr(&inst->addr)); > > + } > > } > > + mutex_unlock(&client->monc.mutex); > > Nit: I'd prefer > > mutex_lock(&client->monc.mutex); > if (client->monc.monmap == NULL) > goto out_unlock; > > ... > > out_unlock: > mutex_unlock(&client->monc.mutex); > return 0; > > to an additional level of indentation here. > OK, I can do this. > > + > > return 0; > > } > > > > @@ -56,13 +58,15 @@ static int osdmap_show(struct seq_file *s, void *p) > > int i; > > struct ceph_client *client = s->private; > > struct ceph_osd_client *osdc = &client->osdc; > > - struct ceph_osdmap *map = osdc->osdmap; > > + struct ceph_osdmap *map = NULL; > > struct rb_node *n; > > > > + down_read(&osdc->lock); > > + > > + map = osdc->osdmap; > > if (map == NULL) > > - return 0; > > + goto finish_osdmap_show; > > > > - down_read(&osdc->lock); > > seq_printf(s, "epoch %u barrier %u flags 0x%x\n", map->epoch, > > osdc->epoch_barrier, map->flags); > > > > @@ -131,6 +135,7 @@ static int osdmap_show(struct seq_file *s, void *p) > > seq_printf(s, "]\n"); > > } > > > > +finish_osdmap_show: > > Nit: I'd rename this label to out_unlock so that name communicates what > the label is for. > OK, sounds good. > > up_read(&osdc->lock); > > return 0; > > } > > diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c > > index ab66b599ac47..b299e5bbddb1 100644 > > --- a/net/ceph/mon_client.c > > +++ b/net/ceph/mon_client.c > > @@ -1232,6 +1232,7 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) > > ceph_auth_destroy(monc->auth); > > out_monmap: > > kfree(monc->monmap); > > + monc->monmap = NULL; > > out: > > return err; > > } > > @@ -1239,6 +1240,8 @@ EXPORT_SYMBOL(ceph_monc_init); > > > > void ceph_monc_stop(struct ceph_mon_client *monc) > > { > > + struct ceph_monmap *old_monmap; > > + > > dout("stop\n"); > > > > mutex_lock(&monc->mutex); > > @@ -1266,7 +1269,11 @@ void ceph_monc_stop(struct ceph_mon_client *monc) > > ceph_msg_put(monc->m_subscribe); > > ceph_msg_put(monc->m_subscribe_ack); > > > > - kfree(monc->monmap); > > + mutex_lock(&monc->mutex); > > Did you have a specific reason for adding locking here and in > ceph_osdc_stop()? I see that David inquired about this in a comment > on v2 and I think the answer to his question is no. > I assume that you mean this: >> Is >> that a "just in case" thing? It doesn't look like the client can get >> resurrected afterwards, but I may have missed something. If it's not just in >> case, does the access and clearance of the pointers need wrapping in the >> appropriate lock? >> > >Let me double check it. Probably, you are right here. My understanding of this comment was that we need to be more accurate with pointers access and it needs to add the locking during the destruction. So, I promised to double check it and I shared v.3 of the patch. It was my vision of the world for that moment. :) > By this point we > are long past the point of no return and no new messages can arrive for > e.g. handle_one_map() to be called. > My idea was simple. Some threads can potentially try to open session during or after ceph_monc_stop() and ceph_osdc_stop() calls. Do you mean we can guarantee that such situation will never happen? Thanks, Slava.
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 4c6441536d55..5c8fd78d6bd5 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -790,8 +790,18 @@ EXPORT_SYMBOL(ceph_reset_client_addr); */ static bool have_mon_and_osd_map(struct ceph_client *client) { - return client->monc.monmap && client->monc.monmap->epoch && - client->osdc.osdmap && client->osdc.osdmap->epoch; + bool have_mon_map = false; + bool have_osd_map = false; + + mutex_lock(&client->monc.mutex); + have_mon_map = client->monc.monmap && client->monc.monmap->epoch; + mutex_unlock(&client->monc.mutex); + + down_read(&client->osdc.lock); + have_osd_map = client->osdc.osdmap && client->osdc.osdmap->epoch; + up_read(&client->osdc.lock); + + return have_mon_map && have_osd_map; } /* diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index 2110439f8a24..6e2014c813ca 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -36,18 +36,20 @@ static int monmap_show(struct seq_file *s, void *p) int i; struct ceph_client *client = s->private; - if (client->monc.monmap == NULL) - return 0; - - seq_printf(s, "epoch %d\n", client->monc.monmap->epoch); - for (i = 0; i < client->monc.monmap->num_mon; i++) { - struct ceph_entity_inst *inst = - &client->monc.monmap->mon_inst[i]; - - seq_printf(s, "\t%s%lld\t%s\n", - ENTITY_NAME(inst->name), - ceph_pr_addr(&inst->addr)); + mutex_lock(&client->monc.mutex); + if (client->monc.monmap) { + seq_printf(s, "epoch %d\n", client->monc.monmap->epoch); + for (i = 0; i < client->monc.monmap->num_mon; i++) { + struct ceph_entity_inst *inst = + &client->monc.monmap->mon_inst[i]; + + seq_printf(s, "\t%s%lld\t%s\n", + ENTITY_NAME(inst->name), + ceph_pr_addr(&inst->addr)); + } } + mutex_unlock(&client->monc.mutex); + return 0; } @@ -56,13 +58,15 @@ static int osdmap_show(struct seq_file *s, void *p) int i; struct ceph_client *client = s->private; struct ceph_osd_client *osdc = &client->osdc; - struct ceph_osdmap *map = osdc->osdmap; + struct ceph_osdmap *map = NULL; struct rb_node *n; + down_read(&osdc->lock); + + map = osdc->osdmap; if (map == NULL) - return 0; + goto finish_osdmap_show; - down_read(&osdc->lock); seq_printf(s, "epoch %u barrier %u flags 0x%x\n", map->epoch, osdc->epoch_barrier, map->flags); @@ -131,6 +135,7 @@ static int osdmap_show(struct seq_file *s, void *p) seq_printf(s, "]\n"); } +finish_osdmap_show: up_read(&osdc->lock); return 0; } diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index ab66b599ac47..b299e5bbddb1 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -1232,6 +1232,7 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) ceph_auth_destroy(monc->auth); out_monmap: kfree(monc->monmap); + monc->monmap = NULL; out: return err; } @@ -1239,6 +1240,8 @@ EXPORT_SYMBOL(ceph_monc_init); void ceph_monc_stop(struct ceph_mon_client *monc) { + struct ceph_monmap *old_monmap; + dout("stop\n"); mutex_lock(&monc->mutex); @@ -1266,7 +1269,11 @@ void ceph_monc_stop(struct ceph_mon_client *monc) ceph_msg_put(monc->m_subscribe); ceph_msg_put(monc->m_subscribe_ack); - kfree(monc->monmap); + mutex_lock(&monc->mutex); + old_monmap = monc->monmap; + monc->monmap = NULL; + mutex_unlock(&monc->mutex); + kfree(old_monmap); } EXPORT_SYMBOL(ceph_monc_stop); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index b24afec24138..762f4df5763b 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -5278,6 +5278,7 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) mempool_destroy(osdc->req_mempool); out_map: ceph_osdmap_destroy(osdc->osdmap); + osdc->osdmap = NULL; out: return err; } @@ -5306,10 +5307,13 @@ void ceph_osdc_stop(struct ceph_osd_client *osdc) WARN_ON(atomic_read(&osdc->num_requests)); WARN_ON(atomic_read(&osdc->num_homeless)); + down_write(&osdc->lock); ceph_osdmap_destroy(osdc->osdmap); + osdc->osdmap = NULL; mempool_destroy(osdc->req_mempool); ceph_msgpool_destroy(&osdc->msgpool_op); ceph_msgpool_destroy(&osdc->msgpool_op_reply); + up_write(&osdc->lock); } int osd_req_op_copy_from_init(struct ceph_osd_request *req,