Message ID | 20210810163731.2qvfuhenolq2gdlv@linutronix.de |
---|---|
State | New |
Headers | show |
Series | [ANNOUNCE] v5.14-rc5-rt8 | expand |
On 8/12/21 10:18 PM, Clark Williams wrote: > On Tue, 10 Aug 2021 18:37:31 +0200 > Sebastian Andrzej Siewior <bigeasy@linutronix.de> wrote: > > Sebastian, et al, > > Got the following panic running v5.14-rc5-rt8: > > Aug 13 06:35:05 oberon kernel: page:000000009ac5dd73 refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x1ab3db > Aug 13 06:35:05 oberon kernel: flags: 0x17ffffc0000000(node=0|zone=2|lastcpupid=0x1fffff) > Aug 13 06:35:05 oberon kernel: raw: 0017ffffc0000000 ffffee1286aceb88 ffffee1287b66288 0000000000000000 > Aug 13 06:35:05 oberon kernel: raw: 0000000000000000 0000000000100000 00000000ffffffff 0000000000000000 > Aug 13 06:35:05 oberon kernel: page dumped because: VM_BUG_ON_PAGE(!PageSlab(page)) > Aug 13 06:35:05 oberon kernel: ------------[ cut here ]------------ > Aug 13 06:35:05 oberon kernel: kernel BUG at include/linux/page-flags.h:814! > Aug 13 06:35:05 oberon kernel: invalid opcode: 0000 [#1] PREEMPT_RT SMP PTI > Aug 13 06:35:05 oberon kernel: CPU: 3 PID: 12345 Comm: hackbench Not tainted 5.14.0-rc5-rt8+ #12 > Aug 13 06:35:05 oberon kernel: Hardware name: /NUC5i7RYB, BIOS RYBDWi35.86A.0359.2016.0906.1028 09/06/2016 > Aug 13 06:35:05 oberon kernel: RIP: 0010:___slab_alloc+0x340/0x940 Are you able to translate this RIP via addr2line? 
> Aug 13 06:35:05 oberon kernel: Code: c6 48 0f a3 05 b1 7b 57 03 72 99 c7 85 78 ff ff ff ff ff ff ff 48 8b 7d 88 e9 8d fd ff ff 48 c7 c6 50 5a 7c b0 e> > Aug 13 06:35:05 oberon kernel: RSP: 0018:ffffba1c4a8b7ab0 EFLAGS: 00010293 > Aug 13 06:35:05 oberon kernel: RAX: 0000000000000000 RBX: 0000000000000002 RCX: ffff9bb765118000 > Aug 13 06:35:05 oberon kernel: RDX: 0000000000000000 RSI: ffffffffaf426050 RDI: 00000000ffffffff > Aug 13 06:35:05 oberon kernel: RBP: ffffba1c4a8b7b70 R08: 0000000000000000 R09: 0000000000000000 > Aug 13 06:35:05 oberon kernel: R10: 0000000000000000 R11: 0000000000000000 R12: ffff9bb7410d3600 > Aug 13 06:35:05 oberon kernel: R13: 0000000000400cc0 R14: 00000000001f7770 R15: ffff9bbe76df7770 > Aug 13 06:35:05 oberon kernel: FS: 00007f474b1be740(0000) GS:ffff9bbe76c00000(0000) knlGS:0000000000000000 > Aug 13 06:35:05 oberon kernel: CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 > Aug 13 06:35:05 oberon kernel: CR2: 00007f60c04bdaf8 CR3: 0000000124f3a003 CR4: 00000000003706e0 > Aug 13 06:35:05 oberon kernel: Call Trace: > Aug 13 06:35:05 oberon kernel: ? __alloc_skb+0x1db/0x270 > Aug 13 06:35:05 oberon kernel: ? __alloc_skb+0x1db/0x270 > Aug 13 06:35:05 oberon kernel: ? kmem_cache_alloc_node+0xa4/0x2b0 > Aug 13 06:35:05 oberon kernel: kmem_cache_alloc_node+0xa4/0x2b0 > Aug 13 06:35:05 oberon kernel: __alloc_skb+0x1db/0x270 > Aug 13 06:35:05 oberon kernel: alloc_skb_with_frags+0x64/0x250 > Aug 13 06:35:05 oberon kernel: sock_alloc_send_pskb+0x260/0x2b0 > Aug 13 06:35:05 oberon kernel: ? bpf_lsm_socket_getpeersec_dgram+0xa/0x10 > Aug 13 06:35:05 oberon kernel: unix_stream_sendmsg+0x27c/0x550 > Aug 13 06:35:05 oberon kernel: ? 
unix_seqpacket_recvmsg+0x60/0x60 > Aug 13 06:35:05 oberon kernel: sock_sendmsg+0xbd/0xd0 > Aug 13 06:35:05 oberon kernel: sock_write_iter+0xb9/0x120 > Aug 13 06:35:05 oberon kernel: new_sync_write+0x175/0x200 > Aug 13 06:35:05 oberon kernel: vfs_write+0x3c4/0x510 > Aug 13 06:35:05 oberon kernel: ksys_write+0xc9/0x110 > Aug 13 06:35:05 oberon kernel: do_syscall_64+0x3b/0x90 > Aug 13 06:35:05 oberon kernel: entry_SYSCALL_64_after_hwframe+0x44/0xae While SLUB RT rewrite is obvious suspect, could be also a boring slab misuse (use-after-free etc), wouldn't be the first related to skb's... If this reproduces well, could you try booting with slub_debug boot param. Could catch the culprit sooner (but also hide the bug, unfortunately). > Aug 13 06:35:05 oberon kernel: RIP: 0033:0x7f474b3a2877 > Aug 13 06:35:05 oberon kernel: Code: 75 05 48 83 c4 58 c3 e8 37 4e ff ff 0f 1f 80 00 00 00 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 01 0> > Aug 13 06:35:05 oberon kernel: RSP: 002b:00007ffe5e71e7a8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 > Aug 13 06:35:05 oberon kernel: RAX: ffffffffffffffda RBX: 00000000000003e8 RCX: 00007f474b3a2877 > Aug 13 06:35:05 oberon kernel: RDX: 00000000000003e8 RSI: 00007ffe5e71e7b0 RDI: 0000000000000010 > Aug 13 06:35:05 oberon kernel: RBP: 00007ffe5e71ebf0 R08: 00007ffe5e71e700 R09: 0000000000000000 > Aug 13 06:35:05 oberon kernel: R10: 0000000000000008 R11: 0000000000000246 R12: 00007ffe5e71e7b0 > Aug 13 06:35:05 oberon kernel: R13: 0000000000000008 R14: 0000560b46008210 R15: 0000000000000000 > Aug 13 06:35:05 oberon kernel: Modules linked in: uinput rfcomm snd_seq_dummy snd_hrtimer xt_CHECKSUM xt_MASQUERADE xt_conntrack ipt_REJECT nf_nat_tf> > Aug 13 06:35:05 oberon kernel: snd_intel_dspcfg snd_hda_codec mei_hdcp snd_hda_core snd_hwdep cfg80211 snd_seq iTCO_wdt snd_seq_device intel_pmc_bxt> > Aug 13 06:35:05 oberon kernel: ---[ end trace 0000000000000002 ]--- > Aug 13 06:35:05 oberon kernel: RIP: 0010:___slab_alloc+0x340/0x940 > Aug 13 
06:35:05 oberon kernel: Code: c6 48 0f a3 05 b1 7b 57 03 72 99 c7 85 78 ff ff ff ff ff ff ff 48 8b 7d 88 e9 8d fd ff ff 48 c7 c6 50 5a 7c b0 e> > Aug 13 06:35:05 oberon kernel: RSP: 0018:ffffba1c4a8b7ab0 EFLAGS: 00010293 > Aug 13 06:35:05 oberon kernel: RAX: 0000000000000000 RBX: 0000000000000002 RCX: ffff9bb765118000 > Aug 13 06:35:05 oberon kernel: RDX: 0000000000000000 RSI: ffffffffaf426050 RDI: 00000000ffffffff > Aug 13 06:35:05 oberon kernel: RBP: ffffba1c4a8b7b70 R08: 0000000000000000 R09: 0000000000000000 > Aug 13 06:35:05 oberon kernel: R10: 0000000000000000 R11: 0000000000000000 R12: ffff9bb7410d3600 > Aug 13 06:35:05 oberon kernel: R13: 0000000000400cc0 R14: 00000000001f7770 R15: ffff9bbe76df7770 > Aug 13 06:35:05 oberon kernel: FS: 00007f474b1be740(0000) GS:ffff9bbe76c00000(0000) knlGS:0000000000000000 > Aug 13 06:35:05 oberon kernel: CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 > Aug 13 06:35:05 oberon kernel: CR2: 00007f60c1d901a0 CR3: 0000000124f3a003 CR4: 00000000003706e0 > Aug 13 06:35:05 oberon kernel: Kernel panic - not syncing: > > Config is attached. > > I was running the rteval script that kicks off parallel kernel builds and hackbench runs as loads and runs cyclictest with a thread on each core: > > $ sudo rteval --duration=10m > > Clark >
On Thu, 12 Aug 2021 22:45:19 +0200 Vlastimil Babka <vbabka@suse.cz> wrote: > On 8/12/21 10:18 PM, Clark Williams wrote: > > On Tue, 10 Aug 2021 18:37:31 +0200 > > Sebastian Andrzej Siewior <bigeasy@linutronix.de> wrote: > > > > Sebastian, et al, > > > > Got the following panic running v5.14-rc5-rt8: > > > > Aug 13 06:35:05 oberon kernel: page:000000009ac5dd73 refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x1ab3db > > Aug 13 06:35:05 oberon kernel: flags: 0x17ffffc0000000(node=0|zone=2|lastcpupid=0x1fffff) > > Aug 13 06:35:05 oberon kernel: raw: 0017ffffc0000000 ffffee1286aceb88 ffffee1287b66288 0000000000000000 > > Aug 13 06:35:05 oberon kernel: raw: 0000000000000000 0000000000100000 00000000ffffffff 0000000000000000 > > Aug 13 06:35:05 oberon kernel: page dumped because: VM_BUG_ON_PAGE(!PageSlab(page)) > > Aug 13 06:35:05 oberon kernel: ------------[ cut here ]------------ > > Aug 13 06:35:05 oberon kernel: kernel BUG at include/linux/page-flags.h:814! > > Aug 13 06:35:05 oberon kernel: invalid opcode: 0000 [#1] PREEMPT_RT SMP PTI > > Aug 13 06:35:05 oberon kernel: CPU: 3 PID: 12345 Comm: hackbench Not tainted 5.14.0-rc5-rt8+ #12 > > Aug 13 06:35:05 oberon kernel: Hardware name: /NUC5i7RYB, BIOS RYBDWi35.86A.0359.2016.0906.1028 09/06/2016 > > Aug 13 06:35:05 oberon kernel: RIP: 0010:___slab_alloc+0x340/0x940 > > Are you able to translate this RIP via addr2line? 
$ addr2line -e /data/o/linux-5.14.y-rt/vmlinux ___slab_alloc+0x340/0x940 <snip>/arch/x86/include/asm/processor.h:440 > > > Aug 13 06:35:05 oberon kernel: Code: c6 48 0f a3 05 b1 7b 57 03 72 99 c7 85 78 ff ff ff ff ff ff ff 48 8b 7d 88 e9 8d fd ff ff 48 c7 c6 50 5a 7c b0 e> > > Aug 13 06:35:05 oberon kernel: RSP: 0018:ffffba1c4a8b7ab0 EFLAGS: 00010293 > > Aug 13 06:35:05 oberon kernel: RAX: 0000000000000000 RBX: 0000000000000002 RCX: ffff9bb765118000 > > Aug 13 06:35:05 oberon kernel: RDX: 0000000000000000 RSI: ffffffffaf426050 RDI: 00000000ffffffff > > Aug 13 06:35:05 oberon kernel: RBP: ffffba1c4a8b7b70 R08: 0000000000000000 R09: 0000000000000000 > > Aug 13 06:35:05 oberon kernel: R10: 0000000000000000 R11: 0000000000000000 R12: ffff9bb7410d3600 > > Aug 13 06:35:05 oberon kernel: R13: 0000000000400cc0 R14: 00000000001f7770 R15: ffff9bbe76df7770 > > Aug 13 06:35:05 oberon kernel: FS: 00007f474b1be740(0000) GS:ffff9bbe76c00000(0000) knlGS:0000000000000000 > > Aug 13 06:35:05 oberon kernel: CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 > > Aug 13 06:35:05 oberon kernel: CR2: 00007f60c04bdaf8 CR3: 0000000124f3a003 CR4: 00000000003706e0 > > Aug 13 06:35:05 oberon kernel: Call Trace: > > Aug 13 06:35:05 oberon kernel: ? __alloc_skb+0x1db/0x270 > > Aug 13 06:35:05 oberon kernel: ? __alloc_skb+0x1db/0x270 > > Aug 13 06:35:05 oberon kernel: ? kmem_cache_alloc_node+0xa4/0x2b0 > > Aug 13 06:35:05 oberon kernel: kmem_cache_alloc_node+0xa4/0x2b0 > > Aug 13 06:35:05 oberon kernel: __alloc_skb+0x1db/0x270 > > Aug 13 06:35:05 oberon kernel: alloc_skb_with_frags+0x64/0x250 > > Aug 13 06:35:05 oberon kernel: sock_alloc_send_pskb+0x260/0x2b0 > > Aug 13 06:35:05 oberon kernel: ? bpf_lsm_socket_getpeersec_dgram+0xa/0x10 > > Aug 13 06:35:05 oberon kernel: unix_stream_sendmsg+0x27c/0x550 > > Aug 13 06:35:05 oberon kernel: ? 
unix_seqpacket_recvmsg+0x60/0x60 > > Aug 13 06:35:05 oberon kernel: sock_sendmsg+0xbd/0xd0 > > Aug 13 06:35:05 oberon kernel: sock_write_iter+0xb9/0x120 > > Aug 13 06:35:05 oberon kernel: new_sync_write+0x175/0x200 > > Aug 13 06:35:05 oberon kernel: vfs_write+0x3c4/0x510 > > Aug 13 06:35:05 oberon kernel: ksys_write+0xc9/0x110 > > Aug 13 06:35:05 oberon kernel: do_syscall_64+0x3b/0x90 > > Aug 13 06:35:05 oberon kernel: entry_SYSCALL_64_after_hwframe+0x44/0xae > > While SLUB RT rewrite is obvious suspect, could be also a boring slab > misuse (use-after-free etc), wouldn't be the first related to skb's... > If this reproduces well, could you try booting with slub_debug boot > param. Could catch the culprit sooner (but also hide the bug, > unfortunately). Without 'slub_debug' it panic's consistently. Adding 'slub_debug' to the boot command line causes odd behavior where load and measurement tasks run by 'rteval' crash, but (so far) no kernel panic. > > > Aug 13 06:35:05 oberon kernel: RIP: 0033:0x7f474b3a2877 > > Aug 13 06:35:05 oberon kernel: Code: 75 05 48 83 c4 58 c3 e8 37 4e ff ff 0f 1f 80 00 00 00 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 01 0> > > Aug 13 06:35:05 oberon kernel: RSP: 002b:00007ffe5e71e7a8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 > > Aug 13 06:35:05 oberon kernel: RAX: ffffffffffffffda RBX: 00000000000003e8 RCX: 00007f474b3a2877 > > Aug 13 06:35:05 oberon kernel: RDX: 00000000000003e8 RSI: 00007ffe5e71e7b0 RDI: 0000000000000010 > > Aug 13 06:35:05 oberon kernel: RBP: 00007ffe5e71ebf0 R08: 00007ffe5e71e700 R09: 0000000000000000 > > Aug 13 06:35:05 oberon kernel: R10: 0000000000000008 R11: 0000000000000246 R12: 00007ffe5e71e7b0 > > Aug 13 06:35:05 oberon kernel: R13: 0000000000000008 R14: 0000560b46008210 R15: 0000000000000000 > > Aug 13 06:35:05 oberon kernel: Modules linked in: uinput rfcomm snd_seq_dummy snd_hrtimer xt_CHECKSUM xt_MASQUERADE xt_conntrack ipt_REJECT nf_nat_tf> > > Aug 13 06:35:05 oberon kernel: snd_intel_dspcfg 
snd_hda_codec mei_hdcp snd_hda_core snd_hwdep cfg80211 snd_seq iTCO_wdt snd_seq_device intel_pmc_bxt> > > Aug 13 06:35:05 oberon kernel: ---[ end trace 0000000000000002 ]--- > > Aug 13 06:35:05 oberon kernel: RIP: 0010:___slab_alloc+0x340/0x940 > > Aug 13 06:35:05 oberon kernel: Code: c6 48 0f a3 05 b1 7b 57 03 72 99 c7 85 78 ff ff ff ff ff ff ff 48 8b 7d 88 e9 8d fd ff ff 48 c7 c6 50 5a 7c b0 e> > > Aug 13 06:35:05 oberon kernel: RSP: 0018:ffffba1c4a8b7ab0 EFLAGS: 00010293 > > Aug 13 06:35:05 oberon kernel: RAX: 0000000000000000 RBX: 0000000000000002 RCX: ffff9bb765118000 > > Aug 13 06:35:05 oberon kernel: RDX: 0000000000000000 RSI: ffffffffaf426050 RDI: 00000000ffffffff > > Aug 13 06:35:05 oberon kernel: RBP: ffffba1c4a8b7b70 R08: 0000000000000000 R09: 0000000000000000 > > Aug 13 06:35:05 oberon kernel: R10: 0000000000000000 R11: 0000000000000000 R12: ffff9bb7410d3600 > > Aug 13 06:35:05 oberon kernel: R13: 0000000000400cc0 R14: 00000000001f7770 R15: ffff9bbe76df7770 > > Aug 13 06:35:05 oberon kernel: FS: 00007f474b1be740(0000) GS:ffff9bbe76c00000(0000) knlGS:0000000000000000 > > Aug 13 06:35:05 oberon kernel: CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 > > Aug 13 06:35:05 oberon kernel: CR2: 00007f60c1d901a0 CR3: 0000000124f3a003 CR4: 00000000003706e0 > > Aug 13 06:35:05 oberon kernel: Kernel panic - not syncing: > > > > Config is attached. > > > > I was running the rteval script that kicks off parallel kernel builds and hackbench runs as loads and runs cyclictest with a thread on each core: > > > > $ sudo rteval --duration=10m > > > > Clark > > > -- The United States Coast Guard Ruining Natural Selection since 1790
On 8/12/21 11:24 PM, Clark Williams wrote: > On Thu, 12 Aug 2021 22:45:19 +0200 > Vlastimil Babka <vbabka@suse.cz> wrote: > >> On 8/12/21 10:18 PM, Clark Williams wrote: >>> On Tue, 10 Aug 2021 18:37:31 +0200 >>> Sebastian Andrzej Siewior <bigeasy@linutronix.de> wrote: >>> >>> Sebastian, et al, >>> >>> Got the following panic running v5.14-rc5-rt8: >>> >>> Aug 13 06:35:05 oberon kernel: page:000000009ac5dd73 refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x1ab3db >>> Aug 13 06:35:05 oberon kernel: flags: 0x17ffffc0000000(node=0|zone=2|lastcpupid=0x1fffff) >>> Aug 13 06:35:05 oberon kernel: raw: 0017ffffc0000000 ffffee1286aceb88 ffffee1287b66288 0000000000000000 >>> Aug 13 06:35:05 oberon kernel: raw: 0000000000000000 0000000000100000 00000000ffffffff 0000000000000000 >>> Aug 13 06:35:05 oberon kernel: page dumped because: VM_BUG_ON_PAGE(!PageSlab(page)) >>> Aug 13 06:35:05 oberon kernel: ------------[ cut here ]------------ >>> Aug 13 06:35:05 oberon kernel: kernel BUG at include/linux/page-flags.h:814! >>> Aug 13 06:35:05 oberon kernel: invalid opcode: 0000 [#1] PREEMPT_RT SMP PTI >>> Aug 13 06:35:05 oberon kernel: CPU: 3 PID: 12345 Comm: hackbench Not tainted 5.14.0-rc5-rt8+ #12 >>> Aug 13 06:35:05 oberon kernel: Hardware name: /NUC5i7RYB, BIOS RYBDWi35.86A.0359.2016.0906.1028 09/06/2016 >>> Aug 13 06:35:05 oberon kernel: RIP: 0010:___slab_alloc+0x340/0x940 >> >> Are you able to translate this RIP via addr2line? > > $ addr2line -e /data/o/linux-5.14.y-rt/vmlinux ___slab_alloc+0x340/0x940 > <snip>/arch/x86/include/asm/processor.h:440 Hm that's not much useful, I'd need the line in mm/slub.c does ./scripts/faddr2line give better output? 
>> >>> Aug 13 06:35:05 oberon kernel: Code: c6 48 0f a3 05 b1 7b 57 03 72 99 c7 85 78 ff ff ff ff ff ff ff 48 8b 7d 88 e9 8d fd ff ff 48 c7 c6 50 5a 7c b0 e> >>> Aug 13 06:35:05 oberon kernel: RSP: 0018:ffffba1c4a8b7ab0 EFLAGS: 00010293 >>> Aug 13 06:35:05 oberon kernel: RAX: 0000000000000000 RBX: 0000000000000002 RCX: ffff9bb765118000 >>> Aug 13 06:35:05 oberon kernel: RDX: 0000000000000000 RSI: ffffffffaf426050 RDI: 00000000ffffffff >>> Aug 13 06:35:05 oberon kernel: RBP: ffffba1c4a8b7b70 R08: 0000000000000000 R09: 0000000000000000 >>> Aug 13 06:35:05 oberon kernel: R10: 0000000000000000 R11: 0000000000000000 R12: ffff9bb7410d3600 >>> Aug 13 06:35:05 oberon kernel: R13: 0000000000400cc0 R14: 00000000001f7770 R15: ffff9bbe76df7770 >>> Aug 13 06:35:05 oberon kernel: FS: 00007f474b1be740(0000) GS:ffff9bbe76c00000(0000) knlGS:0000000000000000 >>> Aug 13 06:35:05 oberon kernel: CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 >>> Aug 13 06:35:05 oberon kernel: CR2: 00007f60c04bdaf8 CR3: 0000000124f3a003 CR4: 00000000003706e0 >>> Aug 13 06:35:05 oberon kernel: Call Trace: >>> Aug 13 06:35:05 oberon kernel: ? __alloc_skb+0x1db/0x270 >>> Aug 13 06:35:05 oberon kernel: ? __alloc_skb+0x1db/0x270 >>> Aug 13 06:35:05 oberon kernel: ? kmem_cache_alloc_node+0xa4/0x2b0 >>> Aug 13 06:35:05 oberon kernel: kmem_cache_alloc_node+0xa4/0x2b0 >>> Aug 13 06:35:05 oberon kernel: __alloc_skb+0x1db/0x270 >>> Aug 13 06:35:05 oberon kernel: alloc_skb_with_frags+0x64/0x250 >>> Aug 13 06:35:05 oberon kernel: sock_alloc_send_pskb+0x260/0x2b0 >>> Aug 13 06:35:05 oberon kernel: ? bpf_lsm_socket_getpeersec_dgram+0xa/0x10 >>> Aug 13 06:35:05 oberon kernel: unix_stream_sendmsg+0x27c/0x550 >>> Aug 13 06:35:05 oberon kernel: ? 
unix_seqpacket_recvmsg+0x60/0x60 >>> Aug 13 06:35:05 oberon kernel: sock_sendmsg+0xbd/0xd0 >>> Aug 13 06:35:05 oberon kernel: sock_write_iter+0xb9/0x120 >>> Aug 13 06:35:05 oberon kernel: new_sync_write+0x175/0x200 >>> Aug 13 06:35:05 oberon kernel: vfs_write+0x3c4/0x510 >>> Aug 13 06:35:05 oberon kernel: ksys_write+0xc9/0x110 >>> Aug 13 06:35:05 oberon kernel: do_syscall_64+0x3b/0x90 >>> Aug 13 06:35:05 oberon kernel: entry_SYSCALL_64_after_hwframe+0x44/0xae >> >> While SLUB RT rewrite is obvious suspect, could be also a boring slab >> misuse (use-after-free etc), wouldn't be the first related to skb's... >> If this reproduces well, could you try booting with slub_debug boot >> param. Could catch the culprit sooner (but also hide the bug, >> unfortunately). > > Without 'slub_debug' it panic's consistently. > > Adding 'slub_debug' to the boot command line causes odd behavior where load and measurement tasks > run by 'rteval' crash, but (so far) no kernel panic. Huh. And nothing in dmesg about corrupted slabs with slub_debug? 
>>> Aug 13 06:35:05 oberon kernel: RIP: 0033:0x7f474b3a2877 >>> Aug 13 06:35:05 oberon kernel: Code: 75 05 48 83 c4 58 c3 e8 37 4e ff ff 0f 1f 80 00 00 00 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 01 0> >>> Aug 13 06:35:05 oberon kernel: RSP: 002b:00007ffe5e71e7a8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 >>> Aug 13 06:35:05 oberon kernel: RAX: ffffffffffffffda RBX: 00000000000003e8 RCX: 00007f474b3a2877 >>> Aug 13 06:35:05 oberon kernel: RDX: 00000000000003e8 RSI: 00007ffe5e71e7b0 RDI: 0000000000000010 >>> Aug 13 06:35:05 oberon kernel: RBP: 00007ffe5e71ebf0 R08: 00007ffe5e71e700 R09: 0000000000000000 >>> Aug 13 06:35:05 oberon kernel: R10: 0000000000000008 R11: 0000000000000246 R12: 00007ffe5e71e7b0 >>> Aug 13 06:35:05 oberon kernel: R13: 0000000000000008 R14: 0000560b46008210 R15: 0000000000000000 >>> Aug 13 06:35:05 oberon kernel: Modules linked in: uinput rfcomm snd_seq_dummy snd_hrtimer xt_CHECKSUM xt_MASQUERADE xt_conntrack ipt_REJECT nf_nat_tf> >>> Aug 13 06:35:05 oberon kernel: snd_intel_dspcfg snd_hda_codec mei_hdcp snd_hda_core snd_hwdep cfg80211 snd_seq iTCO_wdt snd_seq_device intel_pmc_bxt> >>> Aug 13 06:35:05 oberon kernel: ---[ end trace 0000000000000002 ]--- >>> Aug 13 06:35:05 oberon kernel: RIP: 0010:___slab_alloc+0x340/0x940 >>> Aug 13 06:35:05 oberon kernel: Code: c6 48 0f a3 05 b1 7b 57 03 72 99 c7 85 78 ff ff ff ff ff ff ff 48 8b 7d 88 e9 8d fd ff ff 48 c7 c6 50 5a 7c b0 e> >>> Aug 13 06:35:05 oberon kernel: RSP: 0018:ffffba1c4a8b7ab0 EFLAGS: 00010293 >>> Aug 13 06:35:05 oberon kernel: RAX: 0000000000000000 RBX: 0000000000000002 RCX: ffff9bb765118000 >>> Aug 13 06:35:05 oberon kernel: RDX: 0000000000000000 RSI: ffffffffaf426050 RDI: 00000000ffffffff >>> Aug 13 06:35:05 oberon kernel: RBP: ffffba1c4a8b7b70 R08: 0000000000000000 R09: 0000000000000000 >>> Aug 13 06:35:05 oberon kernel: R10: 0000000000000000 R11: 0000000000000000 R12: ffff9bb7410d3600 >>> Aug 13 06:35:05 oberon kernel: R13: 0000000000400cc0 R14: 00000000001f7770 R15: 
ffff9bbe76df7770 >>> Aug 13 06:35:05 oberon kernel: FS: 00007f474b1be740(0000) GS:ffff9bbe76c00000(0000) knlGS:0000000000000000 >>> Aug 13 06:35:05 oberon kernel: CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 >>> Aug 13 06:35:05 oberon kernel: CR2: 00007f60c1d901a0 CR3: 0000000124f3a003 CR4: 00000000003706e0 >>> Aug 13 06:35:05 oberon kernel: Kernel panic - not syncing: >>> >>> Config is attached. >>> >>> I was running the rteval script that kicks off parallel kernel builds and hackbench runs as loads and runs cyclictest with a thread on each core: >>> >>> $ sudo rteval --duration=10m >>> >>> Clark >>> >> > >
On 8/12/21 10:18 PM, Clark Williams wrote: > On Tue, 10 Aug 2021 18:37:31 +0200 > Sebastian Andrzej Siewior <bigeasy@linutronix.de> wrote: > > Sebastian, et al, > > Got the following panic running v5.14-rc5-rt8: BTW, which was the last version that worked for you in this test? The SLUB changes in rt8 should have been minimal, and related to hotplug. On the other hand, if the previous working one was v5.14-rc4 based, the problem could be in rc5...
On Thu, 12 Aug 2021 23:30:29 +0200 Vlastimil Babka <vbabka@suse.cz> wrote: > On 8/12/21 11:24 PM, Clark Williams wrote: > > On Thu, 12 Aug 2021 22:45:19 +0200 > > Vlastimil Babka <vbabka@suse.cz> wrote: > > > >> On 8/12/21 10:18 PM, Clark Williams wrote: > >>> On Tue, 10 Aug 2021 18:37:31 +0200 > >>> Sebastian Andrzej Siewior <bigeasy@linutronix.de> wrote: > >>> > >>> Sebastian, et al, > >>> > >>> Got the following panic running v5.14-rc5-rt8: > >>> > >>> Aug 13 06:35:05 oberon kernel: page:000000009ac5dd73 refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x1ab3db > >>> Aug 13 06:35:05 oberon kernel: flags: 0x17ffffc0000000(node=0|zone=2|lastcpupid=0x1fffff) > >>> Aug 13 06:35:05 oberon kernel: raw: 0017ffffc0000000 ffffee1286aceb88 ffffee1287b66288 0000000000000000 > >>> Aug 13 06:35:05 oberon kernel: raw: 0000000000000000 0000000000100000 00000000ffffffff 0000000000000000 > >>> Aug 13 06:35:05 oberon kernel: page dumped because: VM_BUG_ON_PAGE(!PageSlab(page)) > >>> Aug 13 06:35:05 oberon kernel: ------------[ cut here ]------------ > >>> Aug 13 06:35:05 oberon kernel: kernel BUG at include/linux/page-flags.h:814! > >>> Aug 13 06:35:05 oberon kernel: invalid opcode: 0000 [#1] PREEMPT_RT SMP PTI > >>> Aug 13 06:35:05 oberon kernel: CPU: 3 PID: 12345 Comm: hackbench Not tainted 5.14.0-rc5-rt8+ #12 > >>> Aug 13 06:35:05 oberon kernel: Hardware name: /NUC5i7RYB, BIOS RYBDWi35.86A.0359.2016.0906.1028 09/06/2016 > >>> Aug 13 06:35:05 oberon kernel: RIP: 0010:___slab_alloc+0x340/0x940 > >> > >> Are you able to translate this RIP via addr2line? > > > > $ addr2line -e /data/o/linux-5.14.y-rt/vmlinux ___slab_alloc+0x340/0x940 > > <snip>/arch/x86/include/asm/processor.h:440 > > Hm that's not much useful, I'd need the line in mm/slub.c > does ./scripts/faddr2line give better output? Why, yes it does! 
:) $ ./scripts/faddr2line /data/o/linux-5.14.y-rt/vmlinux ___slab_alloc+0x340/0x940 ___slab_alloc+0x340/0x940: PageSlabPfmemalloc at include/linux/page-flags.h:814 (inlined by) pfmemalloc_match at mm/slub.c:2772 (inlined by) ___slab_alloc at mm/slub.c:2874 > > >> > >>> Aug 13 06:35:05 oberon kernel: Code: c6 48 0f a3 05 b1 7b 57 03 72 99 c7 85 78 ff ff ff ff ff ff ff 48 8b 7d 88 e9 8d fd ff ff 48 c7 c6 50 5a 7c b0 e> > >>> Aug 13 06:35:05 oberon kernel: RSP: 0018:ffffba1c4a8b7ab0 EFLAGS: 00010293 > >>> Aug 13 06:35:05 oberon kernel: RAX: 0000000000000000 RBX: 0000000000000002 RCX: ffff9bb765118000 > >>> Aug 13 06:35:05 oberon kernel: RDX: 0000000000000000 RSI: ffffffffaf426050 RDI: 00000000ffffffff > >>> Aug 13 06:35:05 oberon kernel: RBP: ffffba1c4a8b7b70 R08: 0000000000000000 R09: 0000000000000000 > >>> Aug 13 06:35:05 oberon kernel: R10: 0000000000000000 R11: 0000000000000000 R12: ffff9bb7410d3600 > >>> Aug 13 06:35:05 oberon kernel: R13: 0000000000400cc0 R14: 00000000001f7770 R15: ffff9bbe76df7770 > >>> Aug 13 06:35:05 oberon kernel: FS: 00007f474b1be740(0000) GS:ffff9bbe76c00000(0000) knlGS:0000000000000000 > >>> Aug 13 06:35:05 oberon kernel: CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 > >>> Aug 13 06:35:05 oberon kernel: CR2: 00007f60c04bdaf8 CR3: 0000000124f3a003 CR4: 00000000003706e0 > >>> Aug 13 06:35:05 oberon kernel: Call Trace: > >>> Aug 13 06:35:05 oberon kernel: ? __alloc_skb+0x1db/0x270 > >>> Aug 13 06:35:05 oberon kernel: ? __alloc_skb+0x1db/0x270 > >>> Aug 13 06:35:05 oberon kernel: ? kmem_cache_alloc_node+0xa4/0x2b0 > >>> Aug 13 06:35:05 oberon kernel: kmem_cache_alloc_node+0xa4/0x2b0 > >>> Aug 13 06:35:05 oberon kernel: __alloc_skb+0x1db/0x270 > >>> Aug 13 06:35:05 oberon kernel: alloc_skb_with_frags+0x64/0x250 > >>> Aug 13 06:35:05 oberon kernel: sock_alloc_send_pskb+0x260/0x2b0 > >>> Aug 13 06:35:05 oberon kernel: ? 
bpf_lsm_socket_getpeersec_dgram+0xa/0x10 > >>> Aug 13 06:35:05 oberon kernel: unix_stream_sendmsg+0x27c/0x550 > >>> Aug 13 06:35:05 oberon kernel: ? unix_seqpacket_recvmsg+0x60/0x60 > >>> Aug 13 06:35:05 oberon kernel: sock_sendmsg+0xbd/0xd0 > >>> Aug 13 06:35:05 oberon kernel: sock_write_iter+0xb9/0x120 > >>> Aug 13 06:35:05 oberon kernel: new_sync_write+0x175/0x200 > >>> Aug 13 06:35:05 oberon kernel: vfs_write+0x3c4/0x510 > >>> Aug 13 06:35:05 oberon kernel: ksys_write+0xc9/0x110 > >>> Aug 13 06:35:05 oberon kernel: do_syscall_64+0x3b/0x90 > >>> Aug 13 06:35:05 oberon kernel: entry_SYSCALL_64_after_hwframe+0x44/0xae > >> > >> While SLUB RT rewrite is obvious suspect, could be also a boring slab > >> misuse (use-after-free etc), wouldn't be the first related to skb's... > >> If this reproduces well, could you try booting with slub_debug boot > >> param. Could catch the culprit sooner (but also hide the bug, > >> unfortunately). > > > > Without 'slub_debug' it panic's consistently. > > > > Adding 'slub_debug' to the boot command line causes odd behavior where load and measurement tasks > > run by 'rteval' crash, but (so far) no kernel panic. > > Huh. And nothing in dmesg about corrupted slabs with slub_debug? No, nothing slab related in the log. Typical gnome-shell complaints and a lockdep complaint about MAX_LOCKDEP_ENTRIES too low, but nothing really weird. 
> > >>> Aug 13 06:35:05 oberon kernel: RIP: 0033:0x7f474b3a2877 > >>> Aug 13 06:35:05 oberon kernel: Code: 75 05 48 83 c4 58 c3 e8 37 4e ff ff 0f 1f 80 00 00 00 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 01 0> > >>> Aug 13 06:35:05 oberon kernel: RSP: 002b:00007ffe5e71e7a8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 > >>> Aug 13 06:35:05 oberon kernel: RAX: ffffffffffffffda RBX: 00000000000003e8 RCX: 00007f474b3a2877 > >>> Aug 13 06:35:05 oberon kernel: RDX: 00000000000003e8 RSI: 00007ffe5e71e7b0 RDI: 0000000000000010 > >>> Aug 13 06:35:05 oberon kernel: RBP: 00007ffe5e71ebf0 R08: 00007ffe5e71e700 R09: 0000000000000000 > >>> Aug 13 06:35:05 oberon kernel: R10: 0000000000000008 R11: 0000000000000246 R12: 00007ffe5e71e7b0 > >>> Aug 13 06:35:05 oberon kernel: R13: 0000000000000008 R14: 0000560b46008210 R15: 0000000000000000 > >>> Aug 13 06:35:05 oberon kernel: Modules linked in: uinput rfcomm snd_seq_dummy snd_hrtimer xt_CHECKSUM xt_MASQUERADE xt_conntrack ipt_REJECT nf_nat_tf> > >>> Aug 13 06:35:05 oberon kernel: snd_intel_dspcfg snd_hda_codec mei_hdcp snd_hda_core snd_hwdep cfg80211 snd_seq iTCO_wdt snd_seq_device intel_pmc_bxt> > >>> Aug 13 06:35:05 oberon kernel: ---[ end trace 0000000000000002 ]--- > >>> Aug 13 06:35:05 oberon kernel: RIP: 0010:___slab_alloc+0x340/0x940 > >>> Aug 13 06:35:05 oberon kernel: Code: c6 48 0f a3 05 b1 7b 57 03 72 99 c7 85 78 ff ff ff ff ff ff ff 48 8b 7d 88 e9 8d fd ff ff 48 c7 c6 50 5a 7c b0 e> > >>> Aug 13 06:35:05 oberon kernel: RSP: 0018:ffffba1c4a8b7ab0 EFLAGS: 00010293 > >>> Aug 13 06:35:05 oberon kernel: RAX: 0000000000000000 RBX: 0000000000000002 RCX: ffff9bb765118000 > >>> Aug 13 06:35:05 oberon kernel: RDX: 0000000000000000 RSI: ffffffffaf426050 RDI: 00000000ffffffff > >>> Aug 13 06:35:05 oberon kernel: RBP: ffffba1c4a8b7b70 R08: 0000000000000000 R09: 0000000000000000 > >>> Aug 13 06:35:05 oberon kernel: R10: 0000000000000000 R11: 0000000000000000 R12: ffff9bb7410d3600 > >>> Aug 13 06:35:05 oberon kernel: R13: 
0000000000400cc0 R14: 00000000001f7770 R15: ffff9bbe76df7770 > >>> Aug 13 06:35:05 oberon kernel: FS: 00007f474b1be740(0000) GS:ffff9bbe76c00000(0000) knlGS:0000000000000000 > >>> Aug 13 06:35:05 oberon kernel: CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 > >>> Aug 13 06:35:05 oberon kernel: CR2: 00007f60c1d901a0 CR3: 0000000124f3a003 CR4: 00000000003706e0 > >>> Aug 13 06:35:05 oberon kernel: Kernel panic - not syncing: > >>> > >>> Config is attached. > >>> > >>> I was running the rteval script that kicks off parallel kernel builds and hackbench runs as loads and runs cyclictest with a thread on each core: > >>> > >>> $ sudo rteval --duration=10m > >>> > >>> Clark > >>> > >> > > > > > -- The United States Coast Guard Ruining Natural Selection since 1790
On Thu, 12 Aug 2021 23:36:48 +0200 Vlastimil Babka <vbabka@suse.cz> wrote: > On 8/12/21 10:18 PM, Clark Williams wrote: > > On Tue, 10 Aug 2021 18:37:31 +0200 > > Sebastian Andrzej Siewior <bigeasy@linutronix.de> wrote: > > > > Sebastian, et al, > > > > Got the following panic running v5.14-rc5-rt8: > > BTW, which was the last version that worked for you in this test? > The SLUB changes in rt8 should have been minimal, and related to > hotplug. On the other hand, if the previous working one was v5.14-rc4 > based, the problem could be in rc5... > I'm going to have to work my way backwards for that. I had a ton of warning splats showing up, mainly in kcov and i915 and fixed those before I started running rteval. I'll see if this showed up in rc4 or before but it may take me a couple of days. Clark -- The United States Coast Guard Ruining Natural Selection since 1790
On Thu, 2021-08-12 at 15:18 -0500, Clark Williams wrote: > > Sebastian, et al, > > Got the following panic running v5.14-rc5-rt8: > > ... > Config is attached. > > I was running the rteval script that kicks off parallel kernel builds > and hackbench runs as loads and runs cyclictest with a thread on each > core: > > $ sudo rteval --duration=10m It took my box more than 10 minutes to explode, but with all those debug options turned on, it did manage to do so in fairly short order. Off to build a fixed up 5.13-rt... [ 2081.451009] page:0000000039d9ce01 refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x284580 [ 2081.451015] flags: 0x17fffc00000000(node=0|zone=2|lastcpupid=0x1ffff) [ 2081.451019] raw: 0017fffc00000000 ffffe1ddcb109a08 ffff8abb8dff5c80 0000000000000000 [ 2081.451021] raw: 0000000000000000 0000000000100000 00000000ffffffff 0000000000000000 [ 2081.451022] page dumped because: VM_BUG_ON_PAGE(!PageSlab(page)) [ 2081.451031] ------------[ cut here ]------------ [ 2081.451032] kernel BUG at ./include/linux/page-flags.h:814! 
[ 2081.674089] invalid opcode: 0000 [#1] PREEMPT_RT SMP NOPTI [ 2081.674093] CPU: 0 PID: 32749 Comm: hackbench Kdump: loaded Tainted: G E 5.14.0.g607a4143-tip-rt_debug #5 [ 2081.674096] Hardware name: MEDION MS-7848/MS-7848, BIOS M7848W08.20C 09/23/2013 [ 2081.674097] RIP: 0010:___slab_alloc+0x3f5/0x930 [ 2081.674102] Code: 4c 89 f8 0f 85 26 05 00 00 48 8d 65 d0 5b 41 5a 41 5c 41 5d 41 5e 41 5f 5d 49 8d 62 f8 c3 48 c7 c6 90 2e b7 a2 e8 5b d5 f9 ff <0f> 0b 65 8b 05 c2 fe 69 5e 48 98 8b 8d 70 ff ff ff 48 8d 95 78 ff [ 2081.674104] RSP: 0018:ffff97e8d2247a70 EFLAGS: 00010282 [ 2081.674107] RAX: 0000000000000034 RBX: ffff8ab880045900 RCX: 0000000000000001 [ 2081.674108] RDX: 0000000000000000 RSI: ffffffffa2b9ac2b RDI: 00000000ffffffff [ 2081.674110] RBP: ffff97e8d2247b40 R08: 0000000000000001 R09: 0000000000000001 [ 2081.674111] R10: ffff97e8d2247a70 R11: 0000000000000034 R12: ffff8abb8dff3130 [ 2081.674112] R13: ffffffffa2c23ac0 R14: 00000000ffffffff R15: 00000000001f3130 [ 2081.674114] FS: 00007f8c2f1c0740(0000) GS:ffff8abb8de00000(0000) knlGS:0000000000000000 [ 2081.674115] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 2081.674117] CR2: 00007faca12850a0 CR3: 00000002c4e4c001 CR4: 00000000001706f0 [ 2081.674118] Call Trace: [ 2081.674122] ? lock_is_held_type+0xeb/0x140 [ 2081.674127] ? __alloc_skb+0x8c/0x1b0 [ 2081.674131] ? lock_release+0x289/0x430 [ 2081.674137] ? __alloc_skb+0x8c/0x1b0 [ 2081.674138] ? __alloc_skb+0x8c/0x1b0 [ 2081.674140] ? __slab_alloc.isra.79+0x45/0x60 [ 2081.674142] __slab_alloc.isra.79+0x45/0x60 [ 2081.674145] __kmalloc_node_track_caller+0xca/0x1d0 [ 2081.674149] kmalloc_reserve+0x2e/0x80 [ 2081.674153] __alloc_skb+0x8c/0x1b0 [ 2081.674156] alloc_skb_with_frags+0x53/0x1a0 [ 2081.674160] ? finish_wait+0x80/0x80 [ 2081.674164] sock_alloc_send_pskb+0x23b/0x270 [ 2081.674169] ? 
wait_for_unix_gc+0x42/0xb0 [ 2081.674175] unix_stream_sendmsg+0x209/0x3d0 [ 2081.674183] sock_sendmsg+0x58/0x60 [ 2081.674186] sock_write_iter+0x9a/0x100 [ 2081.674191] new_sync_write+0x1a2/0x1c0 [ 2081.674199] vfs_write+0x3b6/0x410 [ 2081.674204] ksys_write+0x53/0xe0 [ 2081.674206] ? lockdep_hardirqs_on+0x54/0x100 [ 2081.674209] do_syscall_64+0x37/0x80 [ 2081.674214] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 2081.674218] RIP: 0033:0x7f8c2ebd0e93 [ 2081.674221] Code: 75 05 48 83 c4 58 c3 e8 db 3c ff ff 66 2e 0f 1f 84 00 00 00 00 00 90 64 8b 04 25 18 00 00 00 85 c0 75 14 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 55 f3 c3 0f 1f 00 41 54 55 49 89 d4 53 48 89 [ 2081.674223] RSP: 002b:00007ffe616df358 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 [ 2081.674225] RAX: ffffffffffffffda RBX: 00000000000003e8 RCX: 00007f8c2ebd0e93 [ 2081.674226] RDX: 00000000000003e8 RSI: 00007ffe616df360 RDI: 0000000000000013 [ 2081.674227] RBP: 00007ffe616df790 R08: 00007ffe616df2b0 R09: 00007f8c2edd9010 [ 2081.674228] R10: 0000000000000008 R11: 0000000000000246 R12: 00007ffe616df360 [ 2081.674229] R13: 000000000000000b R14: 0000000001f90b30 R15: 0000000000000000 [ 2081.674236] Modules linked in: af_packet(E) nft_fib_inet(E) nft_fib_ipv4(E) nft_fib_ipv6(E) nft_fib(E) nft_reject_inet(E) nft_reject(E) nft_ct(E) nft_chain_nat(E) nf_tables(E) ebtable_nat(E) ebtable_broute(E) ip6table_nat(E) ip6table_mangle(E) ip6table_raw(E) ip6table_security(E) iptable_nat(E) nf_nat(E) nf_conntrack(E) nf_defrag_ipv6(E) nf_defrag_ipv4(E) libcrc32c(E) iptable_mangle(E) iptable_raw(E) iptable_security(E) ip_set(E) bridge(E) nfnetlink(E) stp(E) llc(E) ebtable_filter(E) ebtables(E) iscsi_ibft(E) iscsi_boot_sysfs(E) ip6table_filter(E) ip6_tables(E) rfkill(E) iptable_filter(E) ip_tables(E) x_tables(E) bpfilter(E) nls_iso8859_1(E) nls_cp437(E) joydev(E) hid_logitech_hidpp(E) usblp(E) intel_rapl_msr(E) intel_rapl_common(E) x86_pkg_temp_thermal(E) intel_powerclamp(E) coretemp(E) kvm_intel(E) kvm(E) irqbypass(E) 
crc32_pclmul(E) ghash_clmulni_intel(E) mei_hdcp(E) at24(E) regmap_i2c(E) aesni_intel(E) crypto_simd(E) cryptd(E) iTCO_wdt(E) intel_pmc_bxt(E) iTCO_vendor_support(E) [ 2081.674279] snd_hda_codec_realtek(E) snd_hda_codec_generic(E) ledtrig_audio(E) pcspkr(E) r8169(E) realtek(E) snd_hda_codec_hdmi(E) mdio_devres(E) libphy(E) thermal(E) snd_hda_intel(E) snd_intel_dspcfg(E) snd_intel_sdw_acpi(E) snd_hda_codec(E) snd_hda_core(E) snd_hwdep(E) snd_pcm(E) snd_timer(E) mei_me(E) i2c_i801(E) snd(E) lpc_ich(E) mei(E) i2c_smbus(E) intel_smartconnect(E) fan(E) mfd_core(E) soundcore(E) sch_fq_codel(E) nfsd(E) auth_rpcgss(E) nfs_acl(E) lockd(E) grace(E) fuse(E) configfs(E) sunrpc(E) hid_logitech_dj(E) ums_realtek(E) uas(E) usb_storage(E) hid_generic(E) usbhid(E) nouveau(E) drm_ttm_helper(E) ttm(E) i2c_algo_bit(E) mxm_wmi(E) wmi(E) drm_kms_helper(E) syscopyarea(E) sysfillrect(E) sysimgblt(E) fb_sys_fops(E) xhci_pci(E) cec(E) xhci_pci_renesas(E) ehci_pci(E) ahci(E) rc_core(E) ehci_hcd(E) libahci(E) xhci_hcd(E) drm(E) libata(E) usbcore(E) video(E) button(E) sd_mod(E) t10_pi(E) vfat(E) fat(E) virtio_blk(E) virtio_mmio(E) ext4(E) crc32c_intel(E) crc16(E) mbcache(E) [ 2081.674328] jbd2(E) sg(E) dm_multipath(E) dm_mod(E) scsi_dh_rdac(E) scsi_dh_emc(E) scsi_dh_alua(E) scsi_mod(E) msr(E) [ 2081.674335] Dumping ftrace buffer: [ 2081.674340] (ftrace buffer empty)
On Fri, 2021-08-13 at 12:56 +0200, Mike Galbraith wrote: > On Thu, 2021-08-12 at 15:18 -0500, Clark Williams wrote: > > > > Sebastian, et al, > > > > Got the following panic running v5.14-rc5-rt8: > > > > ... > > > Config is attached. > > > > I was running the rteval script that kicks off parallel kernel builds > > and hackbench runs as loads and runs cyclictest with a thread on each > > core: > > > > $ sudo rteval --duration=10m > > It took my box more than 10 minutes to explode, but with all those > debug options turned on, it did manage to do so in fairly short order. > > Off to build a fixed up 5.13-rt... ...and it reproduced. Transplanted 5.12-rt slub seems stable in both 5.1[34]-rt trees FWIW. -Mike
On 8/14/21 7:33 AM, Mike Galbraith wrote: > On Fri, 2021-08-13 at 12:56 +0200, Mike Galbraith wrote: >> On Thu, 2021-08-12 at 15:18 -0500, Clark Williams wrote: >>> >>> Sebastian, et al, >>> >>> Got the following panic running v5.14-rc5-rt8: >>> >>> ... >> >>> Config is attached. >>> >>> I was running the rteval script that kicks off parallel kernel builds >>> and hackbench runs as loads and runs cyclictest with a thread on each >>> core: >>> >>> $ sudo rteval --duration=10m >> >> It took my box more than 10 minutes to explode, but with all those >> debug options turned on, it did manage to do so in fairly short order. >> >> Off to build a fixed up 5.13-rt... > > ...and it reproduced. Transplanted 5.12-rt slub seems stable in both > 5.1[34]-rt trees FWIW. Why didn't you see it in earlier testing though? What's different now. > -Mike > >
On 8/12/21 11:44 PM, Clark Williams wrote: > On Thu, 12 Aug 2021 23:30:29 +0200 > Vlastimil Babka <vbabka@suse.cz> wrote: > >> On 8/12/21 11:24 PM, Clark Williams wrote: >>> On Thu, 12 Aug 2021 22:45:19 +0200 >>> Vlastimil Babka <vbabka@suse.cz> wrote: >>> >>>> On 8/12/21 10:18 PM, Clark Williams wrote: >>>>> On Tue, 10 Aug 2021 18:37:31 +0200 >>>>> Sebastian Andrzej Siewior <bigeasy@linutronix.de> wrote: >>>>> >>>>> Sebastian, et al, >>>>> >>>>> Got the following panic running v5.14-rc5-rt8: >>>>> >>>>> Aug 13 06:35:05 oberon kernel: page:000000009ac5dd73 refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x1ab3db >>>>> Aug 13 06:35:05 oberon kernel: flags: 0x17ffffc0000000(node=0|zone=2|lastcpupid=0x1fffff) >>>>> Aug 13 06:35:05 oberon kernel: raw: 0017ffffc0000000 ffffee1286aceb88 ffffee1287b66288 0000000000000000 >>>>> Aug 13 06:35:05 oberon kernel: raw: 0000000000000000 0000000000100000 00000000ffffffff 0000000000000000 >>>>> Aug 13 06:35:05 oberon kernel: page dumped because: VM_BUG_ON_PAGE(!PageSlab(page)) >>>>> Aug 13 06:35:05 oberon kernel: ------------[ cut here ]------------ >>>>> Aug 13 06:35:05 oberon kernel: kernel BUG at include/linux/page-flags.h:814! >>>>> Aug 13 06:35:05 oberon kernel: invalid opcode: 0000 [#1] PREEMPT_RT SMP PTI >>>>> Aug 13 06:35:05 oberon kernel: CPU: 3 PID: 12345 Comm: hackbench Not tainted 5.14.0-rc5-rt8+ #12 >>>>> Aug 13 06:35:05 oberon kernel: Hardware name: /NUC5i7RYB, BIOS RYBDWi35.86A.0359.2016.0906.1028 09/06/2016 >>>>> Aug 13 06:35:05 oberon kernel: RIP: 0010:___slab_alloc+0x340/0x940 >>>> >>>> Are you able to translate this RIP via addr2line? >>> >>> $ addr2line -e /data/o/linux-5.14.y-rt/vmlinux ___slab_alloc+0x340/0x940 >>> <snip>/arch/x86/include/asm/processor.h:440 >> >> Hm that's not much useful, I'd need the line in mm/slub.c >> does ./scripts/faddr2line give better output? > > Why, yes it does! 
:) > > $ ./scripts/faddr2line /data/o/linux-5.14.y-rt/vmlinux ___slab_alloc+0x340/0x940 > ___slab_alloc+0x340/0x940: > PageSlabPfmemalloc at include/linux/page-flags.h:814 > (inlined by) pfmemalloc_match at mm/slub.c:2772 > (inlined by) ___slab_alloc at mm/slub.c:2874 Aha! That's helpful. Hopefully it's just a small issue where we opportunistically test flags on a page that's protected by the local lock we didn't take yet, and I didn't realize there's the VM_BUG_ON which can trigger if our page went away (which we would have realized after taking the lock). So hopefully the below diff with uninspired naming should help? ----8<---- diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 5922031ffab6..24579f71001e 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -815,6 +815,11 @@ static inline int PageSlabPfmemalloc(struct page *page) return PageActive(page); } +static inline int __PageSlabPfmemalloc(struct page *page) +{ + return PageActive(page); +} + static inline void SetPageSlabPfmemalloc(struct page *page) { VM_BUG_ON_PAGE(!PageSlab(page), page); diff --git a/mm/slub.c b/mm/slub.c index ef022fe159c6..3cc7d58a08fa 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2775,6 +2775,14 @@ static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags) return true; } +static inline bool try_pfmemalloc_match(struct page *page, gfp_t gfpflags) +{ + if (unlikely(__PageSlabPfmemalloc(page))) + return gfp_pfmemalloc_allowed(gfpflags); + + return true; +} + /* * Check the page->freelist of a page and either transfer the freelist to the * per cpu freelist or deactivate the page. 
@@ -2871,7 +2879,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, * PFMEMALLOC but right now, we are losing the pfmemalloc * information when the page leaves the per-cpu allocator */ - if (unlikely(!pfmemalloc_match(page, gfpflags))) + if (unlikely(!try_pfmemalloc_match(page, gfpflags))) goto deactivate_slab; /* must check again c->page in case we got preempted and it changed */
On Sat, 2021-08-14 at 20:28 +0200, Vlastimil Babka wrote: > > Why didn't you see it in earlier testing though? What's different now. The provided debug options make the difference. Without that pile, an otherwise identical config will happily slog away for hours, add them, it's a matter of minutes. -Mike
On Sat, 2021-08-14 at 21:08 +0200, Vlastimil Babka wrote: > On 8/12/21 11:44 PM, Clark Williams wrote: > > > > $ ./scripts/faddr2line /data/o/linux-5.14.y-rt/vmlinux ___slab_alloc+0x340/0x940 > > ___slab_alloc+0x340/0x940: > > PageSlabPfmemalloc at include/linux/page-flags.h:814 > > (inlined by) pfmemalloc_match at mm/slub.c:2772 > > (inlined by) ___slab_alloc at mm/slub.c:2874 > > Aha! That's helpful. Hopefully it's just a small issue where we > opportunistically test flags on a page that's protected by the local > lock we didn't take yet, and I didn't realize there's the VM_BUG_ON > which can trigger if our page went away (which we would have realized > after taking the lock). > > So hopefully the below diff with uninspired naming should help? I bet a nickle it does, shall let box slave away testing that theory while its master pedals around the sunny Bavarian countryside. -Mike
On Sat, 2021-08-14 at 21:08 +0200, Vlastimil Babka wrote: > > Aha! That's helpful. Hopefully it's just a small issue where we > opportunistically test flags on a page that's protected by the local > lock we didn't take yet, and I didn't realize there's the VM_BUG_ON > which can trigger if our page went away (which we would have realized > after taking the lock). Speaking of optimistic peeking perhaps going badly, why is the below not true? There's protection against ->partial disappearing during a preemption... but can't it just as easily appear, so where is that protection? If the other side of that window is safe, it could use a comment so dummies reading this code don't end up asking mm folks why the heck they don't just take the darn lock and be done with it instead of tap dancing all around the thing :) --- mm/slub.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) --- a/mm/slub.c +++ b/mm/slub.c @@ -2937,17 +2937,16 @@ static void *___slab_alloc(struct kmem_c new_slab: + /* + * To avoid false negative race with put_cpu_partial() during a + * preemption, we must call slub_percpu_partial() under lock. + */ + local_lock_irqsave(&s->cpu_slab->lock, flags); if (slub_percpu_partial(c)) { - local_lock_irqsave(&s->cpu_slab->lock, flags); if (unlikely(c->page)) { local_unlock_irqrestore(&s->cpu_slab->lock, flags); goto reread_page; } - if (unlikely(!slub_percpu_partial(c))) { - local_unlock_irqrestore(&s->cpu_slab->lock, flags); - /* we were preempted and partial list got empty */ - goto new_objects; - } page = c->page = slub_percpu_partial(c); slub_set_percpu_partial(c, page); @@ -2955,8 +2954,7 @@ static void *___slab_alloc(struct kmem_c stat(s, CPU_PARTIAL_ALLOC); goto redo; } - -new_objects: + local_unlock_irqrestore(&s->cpu_slab->lock, flags); freelist = get_partial(s, gfpflags, node, &page); if (freelist)
On Sun, 2021-08-15 at 05:13 +0200, Mike Galbraith wrote: > > > > So hopefully the below diff with uninspired naming should help? > > I bet a nickle it does... And it did. -Mike
On 8/15/21 6:17 AM, Mike Galbraith wrote: > On Sat, 2021-08-14 at 21:08 +0200, Vlastimil Babka wrote: >> >> Aha! That's helpful. Hopefully it's just a small issue where we >> opportunistically test flags on a page that's protected by the local >> lock we didn't take yet, and I didn't realize there's the VM_BUG_ON >> which can trigger if our page went away (which we would have realized >> after taking the lock). > > Speaking of optimistic peeking perhaps going badly, why is the below > not true? There's protection against ->partial going disappearing > during a preemption... but can't it just as easily appear, so where is > that protection? If it appears, it appears, we don't care, we just leave it there and won't use it. > If the other side of that window is safe, it could use a comment so > dummies reading this code don't end up asking mm folks why the heck > they don't just take the darn lock and be done with it instead of tap > dancing all around thething :) Well, with your patch, ->partial might appear just after the unlock, so does that really change anything? The point is to avoid taking the lock if it's almost certain there will be nothing to gain. c->partial appearing is easy to just ignore. c->page appearing, while we got our own page, is worse as there can be only one c->page. But it's unavoidable, we can't just keep holding the local lock while going to the page allocator etc. That's why under retry_load_page: we have to deactivate a c->page that appeared under us... > --- > mm/slub.c | 14 ++++++-------- > 1 file changed, 6 insertions(+), 8 deletions(-) > > --- a/mm/slub.c > +++ b/mm/slub.c > @@ -2937,17 +2937,16 @@ static void *___slab_alloc(struct kmem_c > > new_slab: > > + /* > + * To avoid false negative race with put_cpu_partial() during a > + * preemption, we must call slub_percpu_partial() under lock. 
> + */ > + local_lock_irqsave(&s->cpu_slab->lock, flags); > if (slub_percpu_partial(c)) { > - local_lock_irqsave(&s->cpu_slab->lock, flags); > if (unlikely(c->page)) { > local_unlock_irqrestore(&s->cpu_slab->lock, flags); > goto reread_page; > } > - if (unlikely(!slub_percpu_partial(c))) { > - local_unlock_irqrestore(&s->cpu_slab->lock, flags); > - /* we were preempted and partial list got empty */ > - goto new_objects; > - } > > page = c->page = slub_percpu_partial(c); > slub_set_percpu_partial(c, page); > @@ -2955,8 +2954,7 @@ static void *___slab_alloc(struct kmem_c > stat(s, CPU_PARTIAL_ALLOC); > goto redo; > } > - > -new_objects: > + local_unlock_irqrestore(&s->cpu_slab->lock, flags); > > freelist = get_partial(s, gfpflags, node, &page); > if (freelist) > >
On Sun, 2021-08-15 at 11:35 +0200, Vlastimil Babka wrote: > On 8/15/21 6:17 AM, Mike Galbraith wrote: > > On Sat, 2021-08-14 at 21:08 +0200, Vlastimil Babka wrote: > > > > > > Aha! That's helpful. Hopefully it's just a small issue where we > > > opportunistically test flags on a page that's protected by the local > > > lock we didn't take yet, and I didn't realize there's the VM_BUG_ON > > > which can trigger if our page went away (which we would have realized > > > after taking the lock). > > > > Speaking of optimistic peeking perhaps going badly, why is the below > > not true? There's protection against ->partial going disappearing > > during a preemption... but can't it just as easily appear, so where is > > that protection? > > If it appears, it appears, we don't care, we just leave it there and > won't use it. > > > If the other side of that window is safe, it could use a comment so > > dummies reading this code don't end up asking mm folks why the heck > > they don't just take the darn lock and be done with it instead of tap > > dancing all around thething :) > > Well, with your patch, ->partial might appear just after the unlock, so > does that really change anything? Viewed from pov consumption is optional, it makes sense. -Mike
diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h index 65fa7498a4d2c..531cb503d4c49 100644 --- a/include/linux/rtmutex.h +++ b/include/linux/rtmutex.h @@ -16,6 +16,7 @@ #include <linux/linkage.h> #include <linux/rbtree_types.h> #include <linux/spinlock_types_raw.h> +#include <linux/compiler.h> extern int max_lock_depth; /* for sysctl */ @@ -40,7 +41,7 @@ struct rt_mutex_base { */ static inline bool rt_mutex_base_is_locked(struct rt_mutex_base *lock) { - return lock->owner != NULL; + return READ_ONCE(lock->owner) != NULL; } extern void rt_mutex_base_init(struct rt_mutex_base *rtb); diff --git a/include/linux/spinlock_rt.h b/include/linux/spinlock_rt.h index 7ed2a33df7cc7..4fc72199cc9d2 100644 --- a/include/linux/spinlock_rt.h +++ b/include/linux/spinlock_rt.h @@ -112,7 +112,7 @@ static __always_inline void spin_unlock_irq(spinlock_t *lock) static __always_inline void spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) { - spin_unlock(lock); + rt_spin_unlock(lock); } #define spin_trylock(lock) \ diff --git a/kernel/futex.c b/kernel/futex.c index 41e3d63160a78..fcc0570868b7b 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1357,27 +1357,6 @@ static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key, return 0; } -static int lookup_pi_state(u32 __user *uaddr, u32 uval, - struct futex_hash_bucket *hb, - union futex_key *key, struct futex_pi_state **ps, - struct task_struct **exiting) -{ - struct futex_q *top_waiter = futex_top_waiter(hb, key); - - /* - * If there is a waiter on that futex, validate it and - * attach to the pi_state when the validation succeeds. - */ - if (top_waiter) - return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps); - - /* - * We are the first waiter - try to look up the owner based on - * @uval and attach to it. 
- */ - return attach_to_pi_owner(uaddr, uval, key, ps, exiting); -} - static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) { int err; @@ -2134,13 +2113,6 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, if (uaddr1 == uaddr2) return -EINVAL; - /* - * requeue_pi requires a pi_state, try to allocate it now - * without any locks in case it fails. - */ - if (refill_pi_state_cache()) - return -ENOMEM; - /* * futex_requeue() allows the caller to define the number * of waiters to wake up via the @nr_wake argument. With @@ -2164,6 +2136,13 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, */ if (nr_wake != 1) return -EINVAL; + + /* + * requeue_pi requires a pi_state, try to allocate it now + * without any locks in case it fails. + */ + if (refill_pi_state_cache()) + return -ENOMEM; } retry: @@ -2213,7 +2192,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, } } - if (requeue_pi && (task_count - nr_wake < nr_requeue)) { + if (requeue_pi) { struct task_struct *exiting = NULL; /* @@ -2232,18 +2211,15 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, * At this point the top_waiter has either taken uaddr2 or is * waiting on it. If the former, then the pi_state will not * exist yet, look it up one more time to ensure we have a - * reference to it. If the lock was taken, ret contains the - * vpid of the top waiter task. + * reference to it. If the lock was taken, @ret contains the + * VPID of the top waiter task. * If the lock was not taken, we have pi_state and an initial * refcount on it. In case of an error we have nothing. * * The top waiter's requeue_state is up to date: * * - If the lock was acquired atomically (ret > 0), then - * the state is Q_REQUEUE_PI_LOCKED. No matter whether - * the below lookup_pi_state() fails or not requeue_state - * is correct because that waiter is dequeued and woken - * up and nothing can hold it up. + * the state is Q_REQUEUE_PI_LOCKED. 
* * - If the trylock failed with an error (ret < 0) then * the state is either Q_REQUEUE_PI_NONE, i.e. "nothing @@ -2262,19 +2238,25 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, WARN_ON(pi_state); task_count++; /* - * If we acquired the lock, then the user space value - * of uaddr2 should be vpid. It cannot be changed by - * the top waiter as it is blocked on hb2 lock if it - * tries to do so. If something fiddled with it behind - * our back the pi state lookup might unearth it. So - * we rather use the known value than rereading and - * handing potential crap to lookup_pi_state. + * If futex_proxy_trylock_atomic() acquired the + * user space futex, then the user space value + * @uaddr2 has been set to the @hb1's top waiter + * task VPID. This task is guaranteed to be alive + * and cannot be exiting because it is either + * sleeping or blocked on @hb2 lock. * - * If that call succeeds then we have pi_state and an - * initial refcount on it. + * The @uaddr2 futex cannot have waiters either as + * otherwise futex_proxy_trylock_atomic() would not + * have succeeded. + * + * In order to requeue waiters to @hb2, pi state is + * required. Hand in the VPID value (@ret) and + * allocate PI state with an initial refcount on + * it. */ - ret = lookup_pi_state(uaddr2, ret, hb2, &key2, - &pi_state, &exiting); + ret = attach_to_pi_owner(uaddr2, ret, &key2, &pi_state, + &exiting); + WARN_ON(ret); } switch (ret) { @@ -2413,9 +2395,9 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, } /* - * We took an extra initial reference to the pi_state either - * in futex_proxy_trylock_atomic() or in lookup_pi_state(). We - * need to drop it here again. + * We took an extra initial reference to the pi_state either in + * futex_proxy_trylock_atomic() or in attach_to_pi_owner(). We need + * to drop it here again. 
*/ put_pi_state(pi_state); @@ -2594,7 +2576,7 @@ static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, * Modifying pi_state _before_ the user space value would leave the * pi_state in an inconsistent state when we fault here, because we * need to drop the locks to handle the fault. This might be observed - * in the PID check in lookup_pi_state. + * in the PID checks when attaching to PI state . */ retry: if (!argowner) { diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 7522c3abacb6c..44472115aaf66 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -284,11 +284,28 @@ static __always_inline bool unlock_rt_mutex_safe(struct rt_mutex_base *lock, } #endif +static __always_inline int __waiter_prio(struct task_struct *task) +{ + int prio = task->prio; + + if (!rt_prio(prio)) + return DEFAULT_PRIO; + + return prio; +} + +static __always_inline void +waiter_update_prio(struct rt_mutex_waiter *waiter, struct task_struct *task) +{ + waiter->prio = __waiter_prio(task); + waiter->deadline = task->dl.deadline; +} + /* * Only use with rt_mutex_waiter_{less,equal}() */ #define task_to_waiter(p) \ - &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline } + &(struct rt_mutex_waiter){ .prio = __waiter_prio(p), .deadline = (p)->dl.deadline } static __always_inline int rt_mutex_waiter_less(struct rt_mutex_waiter *left, struct rt_mutex_waiter *right) @@ -356,11 +373,15 @@ static __always_inline bool __waiter_less(struct rb_node *a, const struct rb_nod if (rt_mutex_waiter_less(aw, bw)) return 1; + + if (!build_ww_mutex()) + return 0; + if (rt_mutex_waiter_less(bw, aw)) return 0; /* NOTE: relies on waiter->ww_ctx being set before insertion */ - if (build_ww_mutex() && aw->ww_ctx) { + if (aw->ww_ctx) { if (!bw->ww_ctx) return 1; @@ -775,8 +796,7 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task, * serializes all pi_waiters access and rb_erase() does not care about * the values of the 
node being removed. */ - waiter->prio = task->prio; - waiter->deadline = task->dl.deadline; + waiter_update_prio(waiter, task); rt_mutex_enqueue(lock, waiter); @@ -1045,8 +1065,7 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock, raw_spin_lock(&task->pi_lock); waiter->task = task; waiter->lock = lock; - waiter->prio = task->prio; - waiter->deadline = task->dl.deadline; + waiter_update_prio(waiter, task); /* Get the top priority waiter on the lock */ if (rt_mutex_has_waiters(lock)) @@ -1284,27 +1303,34 @@ static __always_inline void __rt_mutex_unlock(struct rt_mutex_base *lock) } #ifdef CONFIG_SMP -/* - * Note that owner is a speculative pointer and dereferencing relies - * on rcu_read_lock() and the check against the lock owner. - */ -static bool rtmutex_adaptive_spinwait(struct rt_mutex_base *lock, - struct task_struct *owner) +static bool rtmutex_spin_on_owner(struct rt_mutex_base *lock, + struct rt_mutex_waiter *waiter, + struct task_struct *owner) { bool res = true; rcu_read_lock(); for (;;) { - /* Owner changed. Trylock again */ + /* If owner changed, trylock again. */ if (owner != rt_mutex_owner(lock)) break; /* - * Ensure that owner->on_cpu is dereferenced _after_ - * checking the above to be valid. + * Ensure that @owner is dereferenced after checking that + * the lock owner still matches @owner. If that fails, + * @owner might point to freed memory. If it still matches, + * the rcu_read_lock() ensures the memory stays valid. 
*/ barrier(); - if (!owner->on_cpu || need_resched() || - vcpu_is_preempted(task_cpu(owner))) { + /* + * Stop spinning when: + * - the lock owner has been scheduled out + * - current is not longer the top waiter + * - current is requested to reschedule (redundant + * for CONFIG_PREEMPT_RCU=y) + * - the VCPU on which owner runs is preempted + */ + if (!owner->on_cpu || waiter != rt_mutex_top_waiter(lock) || + need_resched() || vcpu_is_preempted(task_cpu(owner))) { res = false; break; } @@ -1314,8 +1340,9 @@ static bool rtmutex_adaptive_spinwait(struct rt_mutex_base *lock, return res; } #else -static bool rtmutex_adaptive_spinwait(struct rt_mutex_base *lock, - struct task_struct *owner) +static bool rtmutex_spin_on_owner(struct rt_mutex_base *lock, + struct rt_mutex_waiter *waiter, + struct task_struct *owner) { return false; } @@ -1434,7 +1461,7 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock, owner = NULL; raw_spin_unlock_irq(&lock->wait_lock); - if (!owner || !rtmutex_adaptive_spinwait(lock, owner)) + if (!owner || !rtmutex_spin_on_owner(lock, waiter, owner)) schedule(); raw_spin_lock_irq(&lock->wait_lock); @@ -1616,7 +1643,7 @@ static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock) owner = NULL; raw_spin_unlock_irq(&lock->wait_lock); - if (!owner || !rtmutex_adaptive_spinwait(lock, owner)) + if (!owner || !rtmutex_spin_on_owner(lock, &waiter, owner)) schedule_rtlock(); raw_spin_lock_irq(&lock->wait_lock); diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h index 3ca0f167df544..a077079e387ca 100644 --- a/kernel/locking/ww_mutex.h +++ b/kernel/locking/ww_mutex.h @@ -237,7 +237,7 @@ __ww_ctx_less(struct ww_acquire_ctx *a, struct ww_acquire_ctx *b) int a_prio = a->task->prio; int b_prio = b->task->prio; - if (dl_prio(a_prio) || dl_prio(b_prio)) { + if (rt_prio(a_prio) || rt_prio(b_prio)) { if (a_prio > b_prio) return true; diff --git a/localversion-rt b/localversion-rt index 045478966e9f1..700c857efd9ba 100644 
--- a/localversion-rt +++ b/localversion-rt @@ -1 +1 @@ --rt7 +-rt8 diff --git a/mm/slab_common.c b/mm/slab_common.c index 1c673c323baf2..ec2bb0beed757 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -502,6 +502,7 @@ void kmem_cache_destroy(struct kmem_cache *s) if (unlikely(!s)) return; + cpus_read_lock(); mutex_lock(&slab_mutex); s->refcount--; @@ -516,6 +517,7 @@ void kmem_cache_destroy(struct kmem_cache *s) } out_unlock: mutex_unlock(&slab_mutex); + cpus_read_unlock(); } EXPORT_SYMBOL(kmem_cache_destroy); diff --git a/mm/slub.c b/mm/slub.c index 5d775fafd2160..ef022fe159c65 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -463,7 +463,8 @@ static inline bool ___cmpxchg_double_slab(struct kmem_cache *s, struct page *pag } else #endif { - unsigned long flags; + /* init to 0 to prevent spurious warnings */ + unsigned long flags = 0; __slab_lock(page, &flags, disable_irqs); if (page->freelist == freelist_old && @@ -2636,13 +2637,13 @@ static bool has_cpu_slab(int cpu, struct kmem_cache *s) static DEFINE_MUTEX(flush_lock); static DEFINE_PER_CPU(struct slub_flush_work, slub_flush); -static void flush_all(struct kmem_cache *s) +static void flush_all_cpus_locked(struct kmem_cache *s) { struct slub_flush_work *sfw; unsigned int cpu; + lockdep_assert_cpus_held(); mutex_lock(&flush_lock); - cpus_read_lock(); for_each_online_cpu(cpu) { sfw = &per_cpu(slub_flush, cpu); @@ -2663,10 +2664,16 @@ static void flush_all(struct kmem_cache *s) flush_work(&sfw->work); } - cpus_read_unlock(); mutex_unlock(&flush_lock); } +static void flush_all(struct kmem_cache *s) +{ + cpus_read_lock(); + flush_all_cpus_locked(s); + cpus_read_unlock(); +} + /* * Use the cpu notifier to insure that the cpu slabs are flushed when * necessary. 
@@ -4236,7 +4243,7 @@ int __kmem_cache_shutdown(struct kmem_cache *s) int node; struct kmem_cache_node *n; - flush_all(s); + flush_all_cpus_locked(s); /* Attempt to free all objects */ for_each_kmem_cache_node(s, node, n) { free_partial(s, n); @@ -4512,7 +4519,7 @@ EXPORT_SYMBOL(kfree); * being allocated from last increasing the chance that the last objects * are freed in them. */ -int __kmem_cache_shrink(struct kmem_cache *s) +int __kmem_cache_do_shrink(struct kmem_cache *s) { int node; int i; @@ -4524,7 +4531,6 @@ int __kmem_cache_shrink(struct kmem_cache *s) unsigned long flags; int ret = 0; - flush_all(s); for_each_kmem_cache_node(s, node, n) { INIT_LIST_HEAD(&discard); for (i = 0; i < SHRINK_PROMOTE_MAX; i++) @@ -4574,13 +4580,21 @@ int __kmem_cache_shrink(struct kmem_cache *s) return ret; } +int __kmem_cache_shrink(struct kmem_cache *s) +{ + flush_all(s); + return __kmem_cache_do_shrink(s); +} + static int slab_mem_going_offline_callback(void *arg) { struct kmem_cache *s; mutex_lock(&slab_mutex); - list_for_each_entry(s, &slab_caches, list) - __kmem_cache_shrink(s); + list_for_each_entry(s, &slab_caches, list) { + flush_all_cpus_locked(s); + __kmem_cache_do_shrink(s); + } mutex_unlock(&slab_mutex); return 0;