Message ID | 20220309223431.26560-1-chang.seok.bae@intel.com |
---|---|
Headers | show |
Series | x86/fpu: Make AMX state ready for CPU idle | expand |
On 3/9/22 14:34, Chang S. Bae wrote: > +/* > + * Initialize register state that may prevent from entering low-power idle. > + * This function will be invoked from the cpuidle driver only when needed. > + */ > +void fpu_idle_fpregs(void) > +{ > + if (!fpu_state_size_dynamic()) > + return; Is this check just an optimization? I'm having trouble imagining a situation where we would have: (xfeatures_in_use() & XFEATURE_MASK_XTILE) == true but fpu_state_size_dynamic() == false > + if (xfeatures_in_use() & XFEATURE_MASK_XTILE) { > + tile_release(); > + fpregs_deactivate(¤t->thread.fpu); > + } > +} xfeatures_in_use() isn't exactly expensive either.
On 3/9/2022 2:46 PM, Dave Hansen wrote: > On 3/9/22 14:34, Chang S. Bae wrote: >> +/* >> + * Initialize register state that may prevent from entering low-power idle. >> + * This function will be invoked from the cpuidle driver only when needed. >> + */ >> +void fpu_idle_fpregs(void) >> +{ >> + if (!fpu_state_size_dynamic()) >> + return; > > Is this check just an optimization? No. 0day reported the splat [3] with the earlier code in v1 [1]: if (fpu_state_size_dynamic() && (xfeatures_in_use() & XFEATURE_MASK_XTILE)) { ... } It looks like GCC-9 reordered to hit XGETBV without checking fpu_state_size_dynamic(). So this line was separated to avoid that. > I'm having trouble imagining a situation where we would have: > > (xfeatures_in_use() & XFEATURE_MASK_XTILE) == true > but > fpu_state_size_dynamic() == false Yes, I don't think we have such situation in practice. > >> + if (xfeatures_in_use() & XFEATURE_MASK_XTILE) { >> + tile_release(); >> + fpregs_deactivate(¤t->thread.fpu); >> + } >> +} > > xfeatures_in_use() isn't exactly expensive either. True. Thanks, Chang [1] https://lore.kernel.org/lkml/20211104225226.5031-3-chang.seok.bae@intel.com/ [2] https://lore.kernel.org/lkml/20211104225226.5031-4-chang.seok.bae@intel.com/ [3] 0-day report: FYI, we noticed the following commit (built with gcc-9): commit: 66951fd5ad7a2aabae7dc54be46a4eac667a082c ("x86/fpu: Prepare AMX state for CPU idle") internal-devel devel-hourly-20220113-144327 [chang: this refers to [2]. ] ... on test machine: qemu-system-x86_64 -enable-kvm -cpu SandyBridge -smp 2 -m 16G caused below changes (please refer to attached dmesg/kmsg for entire log/backtrace): [ 1.767825][ T1] x86: Booting SMP configuration: [ 1.768363][ T1] .... node #0, CPUs: #1 [ 0.113687][ T0] kvm-clock: cpu 1, msr 38a24a041, secondary cpu clock [ 0.113687][ T0] masked ExtINT on CPU#1 [ 0.113687][ T0] smpboot: CPU 1 Converting physical 0 to logical die 1 [ 1.780411][ T0] general protection fault, maybe for address 0x1: 0000 [#1] SMP KASAN PTI [ 1.781350][ T0] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.16.0-00004-g66951fd5ad7a #1 [ 1.781350][ T0] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-1 04/01/2014 [ 1.781350][ T0] RIP: 0010:fpu_idle_fpregs (kbuild/src/consumer/arch/x86/include/asm/fpu/xcr.h:12 kbuild/src/consumer/arch/x86/include/asm/fpu/xcr.h:32 kbuild/src/consumer/arch/x86/kernel/fpu/core.c:772) [ 1.781350][ T0] Code: 89 74 24 04 e8 d4 6e 82 00 8b 74 24 04 e9 f8 fe ff ff 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 eb 01 c3 b9 01 00 00 00 <0f> 01 d0 a9 00 00 06 00 75 01 c3 55 53 c4 e2 78 49 c0 65 48 8b 2c All code ======== 0: 89 74 24 04 mov %esi,0x4(%rsp) 4: e8 d4 6e 82 00 callq 0x826edd 9: 8b 74 24 04 mov 0x4(%rsp),%esi d: e9 f8 fe ff ff jmpq 0xffffffffffffff0a 12: 66 66 2e 0f 1f 84 00 data16 nopw %cs:0x0(%rax,%rax,1) 19: 00 00 00 00 1d: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1) 22: eb 01 jmp 0x25 24: c3 retq 25: b9 01 00 00 00 mov $0x1,%ecx 2a:* 0f 01 d0 xgetbv <-- trapping instruction 2d: a9 00 00 06 00 test $0x60000,%eax 32: 75 01 jne 0x35 34: c3 retq 35: 55 push %rbp 36: 53 push %rbx 37: c4 e2 78 49 (bad) 3b: c0 65 48 8b shlb $0x8b,0x48(%rbp) 3f: 2c .byte 0x2c Code starting with the faulting instruction =========================================== 0: 0f 01 d0 xgetbv 3: a9 00 00 06 00 test $0x60000,%eax 8: 75 01 jne 0xb a: c3 retq b: 55 push %rbp c: 53 push %rbx d: c4 e2 78 49 (bad) 11: c0 65 48 8b shlb $0x8b,0x48(%rbp) 15: 2c .byte 0x2c [ 1.781350][ T0] RSP: 0000:ffffffff96007e50 EFLAGS: 00010047 [ 1.781350][ T0] RAX: 0000000000000001 RBX: ffffffff96022280 RCX: 0000000000000001 [ 1.781350][ T0] RDX: 0000000000000000 RSI: 0000000000000008 RDI: ffffffff96db7da0 [ 1.781350][ T0] RBP: 0000000000000000 R08: 0000000000000001 R09: fffffbfff2db6fb5 [ 1.781350][ T0] R10: ffffffff96db7da7 R11: fffffbfff2db6fb4 R12: fffffbfff2c04450 [ 1.781350][ T0] R13: ffffffff96db7da0 R14: dffffc0000000000 R15: 0000000000000000 [ 1.781350][ T0] FS: 0000000000000000(0000) GS:ffff88839d600000(0000) knlGS:0000000000000000 [ 1.781350][ T0] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1.781350][ T0] CR2: ffff88843ffff000 CR3: 0000000388a14000 CR4: 00000000000406f0 [ 1.781350][ T0] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 1.781350][ T0] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 1.781350][ T0] Call Trace: [ 1.781350][ T0] <TASK> [ 1.781350][ T0] ? arch_cpu_idle_enter (kbuild/src/consumer/arch/x86/kernel/process.c:712) [ 1.781350][ T0] ? do_idle (kbuild/src/consumer/kernel/sched/idle.c:294) [ 1.781350][ T0] ? _raw_read_unlock_irqrestore (kbuild/src/consumer/kernel/locking/spinlock.c:161) [ 1.781350][ T0] ? arch_cpu_idle_exit+0xc0/0xc0 [ 1.781350][ T0] ? schedule (kbuild/src/consumer/arch/x86/include/asm/bitops.h:207 (discriminator 1) kbuild/src/consumer/include/asm-generic/bitops/instrumented-non-atomic.h:135 (discriminator 1) kbuild/src/consumer/include/linux/thread_info.h:118 (discriminator 1) kbuild/src/consumer/include/linux/sched.h:2120 (discriminator 1) kbuild/src/consumer/kernel/sched/core.c:6328 (discriminator 1)) [ 1.781350][ T0] ? cpu_startup_entry (kbuild/src/consumer/kernel/sched/idle.c:402 (discriminator 1)) [ 1.781350][ T0] ? start_kernel (kbuild/src/consumer/init/main.c:1137) [ 1.781350][ T0] ? secondary_startup_64_no_verify (kbuild/src/consumer/arch/x86/kernel/head_64.S:283) [ 1.781350][ T0] </TASK> [ 1.781350][ T0] Modules linked in: [ 1.781350][ T0] ---[ end trace 2bef003d678c9f1a ]--- [ 1.781350][ T0] general protection fault, maybe for address 0x1: 0000 [#2] SMP KASAN PTI [ 1.781350][ T0] CPU: 1 PID: 0 Comm: swapper/1 Tainted: G D 5.16.0-00004-g66951fd5ad7a #1 [ 1.781350][ T0] RIP: 0010:fpu_idle_fpregs (kbuild/src/consumer/arch/x86/include/asm/fpu/xcr.h:12 kbuild/src/consumer/arch/x86/include/asm/fpu/xcr.h:32 kbuild/src/consumer/arch/x86/kernel/fpu/core.c:772) [ 1.781350][ T0] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-1 04/01/2014 [ 1.781350][ T0] Code: 89 74 24 04 e8 d4 6e 82 00 8b 74 24 04 e9 f8 fe ff ff 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 eb 01 c3 b9 01 00 00 00 <0f> 01 d0 a9 00 00 06 00 75 01 c3 55 53 c4 e2 78 49 c0 65 48 8b 2c All code ======== 0: 89 74 24 04 mov %esi,0x4(%rsp) 4: e8 d4 6e 82 00 callq 0x826edd 9: 8b 74 24 04 mov 0x4(%rsp),%esi d: e9 f8 fe ff ff jmpq 0xffffffffffffff0a 12: 66 66 2e 0f 1f 84 00 data16 nopw %cs:0x0(%rax,%rax,1) 19: 00 00 00 00 1d: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1) 22: eb 01 jmp 0x25 24: c3 retq 25: b9 01 00 00 00 mov $0x1,%ecx 2a:* 0f 01 d0 xgetbv <-- trapping instruction 2d: a9 00 00 06 00 test $0x60000,%eax 32: 75 01 jne 0x35 34: c3 retq 35: 55 push %rbp 36: 53 push %rbx 37: c4 e2 78 49 (bad) 3b: c0 65 48 8b shlb $0x8b,0x48(%rbp) 3f: 2c .byte 0x2c Code starting with the faulting instruction =========================================== 0: 0f 01 d0 xgetbv 3: a9 00 00 06 00 test $0x60000,%eax 8: 75 01 jne 0xb a: c3 retq b: 55 push %rbp c: 53 push %rbx d: c4 e2 78 49 (bad) 11: c0 65 48 8b shlb $0x8b,0x48(%rbp) 15: 2c .byte 0x2c [ 1.781350][ T0] RIP: 0010:fpu_idle_fpregs (kbuild/src/consumer/arch/x86/include/asm/fpu/xcr.h:12 kbuild/src/consumer/arch/x86/include/asm/fpu/xcr.h:32 kbuild/src/consumer/arch/x86/kernel/fpu/core.c:772) [ 1.781350][ T0] RSP: 0000:ffffffff96007e50 EFLAGS: 00010047 [ 1.781350][ T0] Code: 89 74 24 04 e8 d4 6e 82 00 8b 74 24 04 e9 f8 fe ff ff 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 eb 01 c3 b9 01 00 00 00 <0f> 01 d0 a9 00 00 06 00 75 01 c3 55 53 c4 e2 78 49 c0 65 48 8b 2c All code ======== 0: 89 74 24 04 mov %esi,0x4(%rsp) 4: e8 d4 6e 82 00 callq 0x826edd 9: 8b 74 24 04 mov 0x4(%rsp),%esi d: e9 f8 fe ff ff jmpq 0xffffffffffffff0a 12: 66 66 2e 0f 1f 84 00 data16 nopw %cs:0x0(%rax,%rax,1) 19: 00 00 00 00 1d: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1) 22: eb 01 jmp 0x25 24: c3 retq 25: b9 01 00 00 00 mov $0x1,%ecx 2a:* 0f 01 d0 xgetbv <-- trapping instruction 2d: a9 00 00 06 00 test $0x60000,%eax 32: 75 01 jne 0x35 34: c3 retq 35: 55 push %rbp 36: 53 push %rbx 37: c4 e2 78 49 (bad) 3b: c0 65 48 8b shlb $0x8b,0x48(%rbp) 3f: 2c .byte 0x2c Code starting with the faulting instruction =========================================== 0: 0f 01 d0 xgetbv 3: a9 00 00 06 00 test $0x60000,%eax 8: 75 01 jne 0xb a: c3 retq b: 55 push %rbp c: 53 push %rbx d: c4 e2 78 49 (bad) 11: c0 65 48 8b shlb $0x8b,0x48(%rbp) 15: 2c .byte 0x2c [ 1.781350][ T0] [ 1.781350][ T0] RSP: 0000:ffffc9000010fdf8 EFLAGS: 00010047 [ 1.781350][ T0] RAX: 0000000000000001 RBX: ffffffff96022280 RCX: 0000000000000001 [ 1.781350][ T0] [ 1.781350][ T0] RAX: 0000000000000001 RBX: ffff8881003e0000 RCX: 0000000000000001 [ 1.781350][ T0] RDX: 0000000000000000 RSI: 0000000000000008 RDI: ffffffff96db7da0 [ 1.781350][ T0] RBP: 0000000000000000 R08: 0000000000000001 R09: fffffbfff2db6fb5 [ 1.781350][ T0] RBP: 0000000000000001 R08: 0000000000000001 R09: fffffbfff2db6fb5 [ 1.781350][ T0] R10: ffffffff96db7da7 R11: fffffbfff2db6fb4 R12: fffffbfff2c04450 [ 1.781350][ T0] R10: ffffffff96db7da7 R11: fffffbfff2db6fb4 R12: ffffed102007c000 [ 1.781350][ T0] R13: ffffffff96db7da0 R14: dffffc0000000000 R15: 0000000000000000 [ 1.781350][ T0] FS: 0000000000000000(0000) GS:ffff88839d600000(0000) knlGS:0000000000000000 [ 1.781350][ T0] FS: 0000000000000000(0000) GS:ffff88839d700000(0000) knlGS:0000000000000000 [ 1.781350][ T0] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1.781350][ T0] CR2: ffff88843ffff000 CR3: 0000000388a14000 CR4: 00000000000406f0 [ 1.781350][ T0] CR2: 0000000000000000 CR3: 0000000388a14000 CR4: 00000000000406e0 [ 1.781350][ T0] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 1.781350][ T0] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 1.781350][ T0] Kernel panic - not syncing: Fatal exception [ 1.781350][ T0] Call Trace: [ 1.781350][ T0] <TASK> [ 1.781350][ T0] ? arch_cpu_idle_enter (kbuild/src/consumer/arch/x86/kernel/process.c:712) [ 1.781350][ T0] ? do_idle (kbuild/src/consumer/kernel/sched/idle.c:294) [ 1.781350][ T0] ? _raw_spin_lock_irqsave (kbuild/src/consumer/arch/x86/include/asm/atomic.h:202 kbuild/src/consumer/include/linux/atomic/atomic-instrumented.h:513 kbuild/src/consumer/include/asm-generic/qspinlock.h:82 kbuild/src/consumer/include/linux/spinlock.h:185 kbuild/src/consumer/include/linux/spinlock_api_smp.h:111 kbuild/src/consumer/kernel/locking/spinlock.c:162) [ 1.781350][ T0] ? arch_cpu_idle_exit+0xc0/0xc0
On 3/9/22 15:12, Chang S. Bae wrote: > On 3/9/2022 2:46 PM, Dave Hansen wrote: >> On 3/9/22 14:34, Chang S. Bae wrote: >>> +/* >>> + * Initialize register state that may prevent from entering >>> low-power idle. >>> + * This function will be invoked from the cpuidle driver only when >>> needed. >>> + */ >>> +void fpu_idle_fpregs(void) >>> +{ >>> + if (!fpu_state_size_dynamic()) >>> + return; >> >> Is this check just an optimization? > > No. 0day reported the splat [3] with the earlier code in v1 [1]: > > if (fpu_state_size_dynamic() && (xfeatures_in_use() & > XFEATURE_MASK_XTILE)) { ... } > > It looks like GCC-9 reordered to hit XGETBV without checking > fpu_state_size_dynamic(). So this line was separated to avoid that. I assume that splat is because 0day found a CPU which doesn't support XGETBV1. Since fpu_state_size_dynamic() only ever returns true on XGETBV1 systems so it works as a proxy for checking XGETBV1 support. Right? If so, then fpu_state_size_dynamic() is a *bit* of an oblique way to check for XGETBV1 support. Why don't we do a good old: cpu_feature_enabled(X86_FEATURE_XGETBV1) check? Also, did we get the asm constraints wrong on xgetbv()? Surely we shouldn't be allowing the compiler to reorder it. Do we need a "memory" constraint?
On 3/9/2022 4:24 PM, Dave Hansen wrote: > > I assume that splat is because 0day found a CPU which doesn't support > XGETBV1. Since fpu_state_size_dynamic() only ever returns true on > XGETBV1 systems so it works as a proxy for checking XGETBV1 support. > > Right? > > If so, then fpu_state_size_dynamic() is a *bit* of an oblique way to > check for XGETBV1 support. > > Why don't we do a good old: > > cpu_feature_enabled(X86_FEATURE_XGETBV1) > > check? Agreed, checking XGETBV1 support is the reason for this, so this looks to be straightforward here. > > Also, did we get the asm constraints wrong on xgetbv()? Surely we > shouldn't be allowing the compiler to reorder it. Do we need a "memory" > constraint? I think this is a good point. Perhaps x{get|set}bv() may follow this change [1] to prevent any reordering. BTW, now I'm suspicious of this JMP as patched at runtime with fpu_state_size_dynamic(): 22: eb 01 jmp 0x25 24: c3 retq 25: b9 01 00 00 00 mov $0x1,%ecx 2a:* 0f 01 d0 xgetbv <-- trapping instruction Still, the question is, if so, why it was patched on non-XFD systems. Let me analyze the case a bit further with 0day folks. Thanks, Chang [1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=aa5cacdc29d76a005cbbee018a47faa6e724dd2d
On 3/10/2022 1:00 PM, Chang S. Bae wrote: > > BTW, now I'm suspicious of this JMP as patched at runtime with > fpu_state_size_dynamic(): No, this jump was supposed to be replaced with NOP by objtool but it didn't as fail to interpret TILERELEASE in this case. > > 22: eb 01 jmp 0x25 > 24: c3 retq > 25: b9 01 00 00 00 mov $0x1,%ecx > 2a:* 0f 01 d0 xgetbv <-- trapping instruction > > Still, the question is, if so, why it was patched on non-XFD systems. > Let me analyze the case a bit further with 0day folks. > Looks like 0day picked an internal branch where the instruction's opcode was intentionally removed. In practice, upstream code should accompany by a complete opcode table. If it ever happens, a warning follows like this on build: arch/x86/kernel/fpu/core.o: warning: objtool: can't decode instruction at .text:0x185e But what actually happened is barely indicated by this message alone. This decode failure ends up returning check() immediately [1] so the file is entirely skipped from the tool's process. I came to think of some improvements for this tool: (1) Add more messages like [2]. This may help users understand what happens in this build process. (2) Move on next byte from the failed offset like [3]. Perhaps, this continuation may alleviate the impact. It may misinterpret some bytes but I think it will be re-aligned with padding bytes before the next function (symbol). Include Josh Poimboeuf. Appreciate any feedback. Thanks, Chang [1]: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/objtool/check.c#n3515 [2]: diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 7c33ec67c4a9..34b60fa33fbe 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -3507,27 +3507,27 @@ int check(struct objtool_file *file) set_func_state(&func_cfi); if (!cfi_hash_alloc(1UL << (file->elf->symbol_bits - 3))) - goto out; + goto err; cfi_hash_add(&init_cfi); cfi_hash_add(&func_cfi); ret = decode_sections(file); if (ret < 0) - goto out; + goto err; warnings += ret; if (list_empty(&file->insn_list)) - goto out; + return 0; if (vmlinux && !validate_dup) { ret = validate_vmlinux_functions(file); if (ret < 0) - goto out; + goto err; warnings += ret; - goto out; + return 0; } if (retpoline) { @@ -3539,37 +3539,37 @@ int check(struct objtool_file *file) ret = validate_functions(file); if (ret < 0) - goto out; + goto err; warnings += ret; ret = validate_unwind_hints(file, NULL); if (ret < 0) - goto out; + goto err; warnings += ret; if (!warnings) { ret = validate_reachable_instructions(file); if (ret < 0) - goto out; + goto err; warnings += ret; } ret = create_static_call_sections(file); if (ret < 0) - goto out; + goto err; warnings += ret; if (retpoline) { ret = create_retpoline_sites_sections(file); if (ret < 0) - goto out; + goto err; warnings += ret; } if (mcount) { ret = create_mcount_loc_sections(file); if (ret < 0) - goto out; + goto err; warnings += ret; } @@ -3580,11 +3580,14 @@ int check(struct objtool_file *file) printf("nr_cfi_cache: %ld\n", nr_cfi_cache); } -out: + return 0; + +err: /* * For now, don't fail the kernel build on fatal warnings.These * errors are still fairly common due to the growing matrix of * supported toolchains and their recent pace of change. */ + WARN("check failed - no jump_label instructions were written."); return 0; } [3]: diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 7c33ec67c4a9..1f1515373ca5 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -371,7 +371,7 @@ static int decode_instructions(struct objtool_file *file) !strcmp(sec->name, ".entry.text")) sec->noinstr = true; - for (offset = 0; offset < sec->sh.sh_size; offset += insn->len) { + for (offset = 0; offset < sec->sh.sh_size;) { insn = malloc(sizeof(*insn)); if (!insn) { WARN("malloc failed"); @@ -389,12 +389,15 @@ static int decode_instructions(struct objtool_file *file) &insn->len, &insn->type, &insn->immediate, &insn->stack_ops); - if (ret) - goto err; + if (ret) { + offset++; + continue; + } hash_add(file->insn_hash, &insn->hash, sec_offset_hash(sec, insn->offset)); list_add_tail(&insn->list, &file->insn_list); nr_insns++; + offset += insn->len; } list_for_each_entry(func, &sec->symbol_list, list) { @@ -416,10 +419,6 @@ static int decode_instructions(struct objtool_file *file) printf("nr_insns: %lu\n", nr_insns); return 0; - -err: - free(insn); - return ret; }