diff mbox series

[v3,bpf-next] bpf: make verifier log more relevant by default

Message ID 20200423195850.1259827-1-andriin@fb.com
State Superseded
Headers show
Series [v3,bpf-next] bpf: make verifier log more relevant by default | expand

Commit Message

Andrii Nakryiko April 23, 2020, 7:58 p.m. UTC
To make BPF verifier verbose log more releavant and easier to use to debug
verification failures, "pop" parts of log that were successfully verified.
This has effect of leaving only verifier logs that correspond to code branches
that lead to verification failure, which in practice should result in much
shorter and more relevant verifier log dumps. This behavior is made the
default behavior and can be overriden to do exhaustive logging by specifying
BPF_LOG_LEVEL2 log level.

Using BPF_LOG_LEVEL2 to disable this behavior is not ideal, because in some
cases it's good to have BPF_LOG_LEVEL2 per-instruction register dump
verbosity, but still have only relevant verifier branches logged. But for this
patch, I didn't want to add any new flags. It might be worth-while to just
rethink how BPF verifier logging is performed and requested and streamline it
a bit. But this trimming of successfully verified branches seems to be useful
and a good default behavior.

To test this, I modified runqslower slightly to introduce read of
uninitialized stack variable. Log (**truncated in the middle** to save many
lines out of this commit message) BEFORE this change:

; int handle__sched_switch(u64 *ctx)
0: (bf) r6 = r1
; struct task_struct *prev = (struct task_struct *)ctx[1];
1: (79) r1 = *(u64 *)(r6 +8)
func 'sched_switch' arg1 has btf_id 151 type STRUCT 'task_struct'
2: (b7) r2 = 0
; struct event event = {};
3: (7b) *(u64 *)(r10 -24) = r2
last_idx 3 first_idx 0
regs=4 stack=0 before 2: (b7) r2 = 0
4: (7b) *(u64 *)(r10 -32) = r2
5: (7b) *(u64 *)(r10 -40) = r2
6: (7b) *(u64 *)(r10 -48) = r2
; if (prev->state == TASK_RUNNING)

[ ... instruction dump from insn #7 through #50 are cut out ... ]

51: (b7) r2 = 16
52: (85) call bpf_get_current_comm#16
last_idx 52 first_idx 42
regs=4 stack=0 before 51: (b7) r2 = 16
; bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU,
53: (bf) r1 = r6
54: (18) r2 = 0xffff8881f3868800
56: (18) r3 = 0xffffffff
58: (bf) r4 = r7
59: (b7) r5 = 32
60: (85) call bpf_perf_event_output#25
last_idx 60 first_idx 53
regs=20 stack=0 before 59: (b7) r5 = 32
61: (bf) r2 = r10
; event.pid = pid;
62: (07) r2 += -16
; bpf_map_delete_elem(&start, &pid);
63: (18) r1 = 0xffff8881f3868000
65: (85) call bpf_map_delete_elem#3
; }
66: (b7) r0 = 0
67: (95) exit

from 44 to 66: safe

from 34 to 66: safe

from 11 to 28: R1_w=inv0 R2_w=inv0 R6_w=ctx(id=0,off=0,imm=0) R10=fp0 fp-8=mmmm???? fp-24_w=00000000 fp-32_w=00000000 fp-40_w=00000000 fp-48_w=00000000
; bpf_map_update_elem(&start, &pid, &ts, 0);
28: (bf) r2 = r10
;
29: (07) r2 += -16
; tsp = bpf_map_lookup_elem(&start, &pid);
30: (18) r1 = 0xffff8881f3868000
32: (85) call bpf_map_lookup_elem#1
invalid indirect read from stack off -16+0 size 4
processed 65 insns (limit 1000000) max_states_per_insn 1 total_states 5 peak_states 5 mark_read 4

Notice how there is a successful code path from instruction 0 through 67, few
successfully verified jumps (44->66, 34->66), and only after that 11->28 jump
plus error on instruction #32.

AFTER this change (full verifier log, **no truncation**):

; int handle__sched_switch(u64 *ctx)
0: (bf) r6 = r1
; struct task_struct *prev = (struct task_struct *)ctx[1];
1: (79) r1 = *(u64 *)(r6 +8)
func 'sched_switch' arg1 has btf_id 151 type STRUCT 'task_struct'
2: (b7) r2 = 0
; struct event event = {};
3: (7b) *(u64 *)(r10 -24) = r2
last_idx 3 first_idx 0
regs=4 stack=0 before 2: (b7) r2 = 0
4: (7b) *(u64 *)(r10 -32) = r2
5: (7b) *(u64 *)(r10 -40) = r2
6: (7b) *(u64 *)(r10 -48) = r2
; if (prev->state == TASK_RUNNING)
7: (79) r2 = *(u64 *)(r1 +16)
; if (prev->state == TASK_RUNNING)
8: (55) if r2 != 0x0 goto pc+19
 R1_w=ptr_task_struct(id=0,off=0,imm=0) R2_w=inv0 R6_w=ctx(id=0,off=0,imm=0) R10=fp0 fp-24_w=00000000 fp-32_w=00000000 fp-40_w=00000000 fp-48_w=00000000
; trace_enqueue(prev->tgid, prev->pid);
9: (61) r1 = *(u32 *)(r1 +1184)
10: (63) *(u32 *)(r10 -4) = r1
; if (!pid || (targ_pid && targ_pid != pid))
11: (15) if r1 == 0x0 goto pc+16

from 11 to 28: R1_w=inv0 R2_w=inv0 R6_w=ctx(id=0,off=0,imm=0) R10=fp0 fp-8=mmmm???? fp-24_w=00000000 fp-32_w=00000000 fp-40_w=00000000 fp-48_w=00000000
; bpf_map_update_elem(&start, &pid, &ts, 0);
28: (bf) r2 = r10
;
29: (07) r2 += -16
; tsp = bpf_map_lookup_elem(&start, &pid);
30: (18) r1 = 0xffff8881db3ce800
32: (85) call bpf_map_lookup_elem#1
invalid indirect read from stack off -16+0 size 4
processed 65 insns (limit 1000000) max_states_per_insn 1 total_states 5 peak_states 5 mark_read 4

Notice how in this case, there are 0-11 instructions + jump from 11 to
28 is recorded + 28-32 instructions with error on insn #32.

test_verifier test runner was updated to specify BPF_LOG_LEVEL2 for
VERBOSE_ACCEPT expected result due to potentially "incomplete" success verbose
log at BPF_LOG_LEVEL1.

On success, verbose log will only have a summary of number of processed
instructions, etc, but no branch tracing log. Having just a last succesful
branch tracing seemed weird and confusing. Having small and clean summary log
in success case seems quite logical and nice, though.

Signed-off-by: Andrii Nakryiko <andriin@fb.com>
---
 kernel/bpf/verifier.c                       | 29 ++++++++++++++++++---
 tools/testing/selftests/bpf/test_verifier.c |  7 ++++-
 2 files changed, 31 insertions(+), 5 deletions(-)

Comments

Jiri Benc Nov. 5, 2020, 4:02 p.m. UTC | #1
On Thu, 23 Apr 2020 12:58:50 -0700, Andrii Nakryiko wrote:
> To make BPF verifier verbose log more releavant and easier to use to debug
> verification failures, "pop" parts of log that were successfully verified.
> This has effect of leaving only verifier logs that correspond to code branches
> that lead to verification failure, which in practice should result in much
> shorter and more relevant verifier log dumps. This behavior is made the
> default behavior and can be overriden to do exhaustive logging by specifying
> BPF_LOG_LEVEL2 log level.

This patch broke the test_offload.py selftest:

[...]
Test TC offloads work...
FAIL: Missing or incorrect message from netdevsim in verifier log
[...]

The selftest expects to receive "[netdevsim] Hello from netdevsim!" in
the log (coming from nsim_bpf_verify_insn) but that part of the log is
cleared by bpf_vlog_reset added by this patch.

How can this be fixed? The log level 1 comes from the "verbose" keyword
passed to tc, I don't think it should be increased to 2.

On a related note, the selftest had to start failing after this commit.
It's a bit surprising it did not get caught, is there a bug somewhere
in the test matrix?

Thanks,

 Jiri
Andrii Nakryiko Nov. 5, 2020, 9:22 p.m. UTC | #2
On Thu, Nov 5, 2020 at 8:02 AM Jiri Benc <jbenc@redhat.com> wrote:
>

> On Thu, 23 Apr 2020 12:58:50 -0700, Andrii Nakryiko wrote:

> > To make BPF verifier verbose log more releavant and easier to use to debug

> > verification failures, "pop" parts of log that were successfully verified.

> > This has effect of leaving only verifier logs that correspond to code branches

> > that lead to verification failure, which in practice should result in much

> > shorter and more relevant verifier log dumps. This behavior is made the

> > default behavior and can be overriden to do exhaustive logging by specifying

> > BPF_LOG_LEVEL2 log level.

>

> This patch broke the test_offload.py selftest:

>

> [...]

> Test TC offloads work...

> FAIL: Missing or incorrect message from netdevsim in verifier log

> [...]

>

> The selftest expects to receive "[netdevsim] Hello from netdevsim!" in

> the log (coming from nsim_bpf_verify_insn) but that part of the log is

> cleared by bpf_vlog_reset added by this patch.


Should we just drop check_verifier_log() checks?

>

> How can this be fixed? The log level 1 comes from the "verbose" keyword

> passed to tc, I don't think it should be increased to 2.

>

> On a related note, the selftest had to start failing after this commit.

> It's a bit surprising it did not get caught, is there a bug somewhere

> in the test matrix?


test_progs is the only test runner that's run continuously on every
patch. libbpf CI also runs test_maps and test_verifier. All the other
test binaries/scripts rely on humans to not forget about them. Which
works so-so, as you can see :)

>

> Thanks,

>

>  Jiri

>
Jakub Kicinski Nov. 5, 2020, 9:53 p.m. UTC | #3
On Thu, 5 Nov 2020 13:22:12 -0800 Andrii Nakryiko wrote:
> On Thu, Nov 5, 2020 at 8:02 AM Jiri Benc <jbenc@redhat.com> wrote:

> > On Thu, 23 Apr 2020 12:58:50 -0700, Andrii Nakryiko wrote:  

> > > To make BPF verifier verbose log more releavant and easier to use to debug

> > > verification failures, "pop" parts of log that were successfully verified.

> > > This has effect of leaving only verifier logs that correspond to code branches

> > > that lead to verification failure, which in practice should result in much

> > > shorter and more relevant verifier log dumps. This behavior is made the

> > > default behavior and can be overriden to do exhaustive logging by specifying

> > > BPF_LOG_LEVEL2 log level.  

> >

> > This patch broke the test_offload.py selftest:

> >

> > [...]

> > Test TC offloads work...

> > FAIL: Missing or incorrect message from netdevsim in verifier log

> > [...]

> >

> > The selftest expects to receive "[netdevsim] Hello from netdevsim!" in

> > the log (coming from nsim_bpf_verify_insn) but that part of the log is

> > cleared by bpf_vlog_reset added by this patch.  

> 

> Should we just drop check_verifier_log() checks?


Drivers only print error messages when something goes wrong, so the
messages are high priority. IIUC this change was just supposed to
decrease verbosity, right?
Andrii Nakryiko Nov. 5, 2020, 10:41 p.m. UTC | #4
On Thu, Nov 5, 2020 at 1:53 PM Jakub Kicinski <kuba@kernel.org> wrote:
>

> On Thu, 5 Nov 2020 13:22:12 -0800 Andrii Nakryiko wrote:

> > On Thu, Nov 5, 2020 at 8:02 AM Jiri Benc <jbenc@redhat.com> wrote:

> > > On Thu, 23 Apr 2020 12:58:50 -0700, Andrii Nakryiko wrote:

> > > > To make BPF verifier verbose log more releavant and easier to use to debug

> > > > verification failures, "pop" parts of log that were successfully verified.

> > > > This has effect of leaving only verifier logs that correspond to code branches

> > > > that lead to verification failure, which in practice should result in much

> > > > shorter and more relevant verifier log dumps. This behavior is made the

> > > > default behavior and can be overriden to do exhaustive logging by specifying

> > > > BPF_LOG_LEVEL2 log level.

> > >

> > > This patch broke the test_offload.py selftest:

> > >

> > > [...]

> > > Test TC offloads work...

> > > FAIL: Missing or incorrect message from netdevsim in verifier log

> > > [...]

> > >

> > > The selftest expects to receive "[netdevsim] Hello from netdevsim!" in

> > > the log (coming from nsim_bpf_verify_insn) but that part of the log is

> > > cleared by bpf_vlog_reset added by this patch.

> >

> > Should we just drop check_verifier_log() checks?

>

> Drivers only print error messages when something goes wrong, so the

> messages are high priority. IIUC this change was just supposed to

> decrease verbosity, right?


Seems like check_verifier_log() in test_offline.py is only called for
successful cases. This patch truncates parts of the verifier log that
correspond to successfully validated code paths, so that in case if
verification fails, only relevant parts are left. So for completely
successful verification the log will be almost empty, with only final
stats available.
Jakub Kicinski Nov. 5, 2020, 10:57 p.m. UTC | #5
On Thu, 5 Nov 2020 14:41:12 -0800 Andrii Nakryiko wrote:
> On Thu, Nov 5, 2020 at 1:53 PM Jakub Kicinski <kuba@kernel.org> wrote:
> > On Thu, 5 Nov 2020 13:22:12 -0800 Andrii Nakryiko wrote:  
> > > Should we just drop check_verifier_log() checks?  
> >
> > Drivers only print error messages when something goes wrong, so the
> > messages are high priority. IIUC this change was just supposed to
> > decrease verbosity, right?  
> 
> Seems like check_verifier_log() in test_offline.py is only called for
> successful cases. This patch truncates parts of the verifier log that
> correspond to successfully validated code paths, so that in case if
> verification fails, only relevant parts are left. So for completely
> successful verification the log will be almost empty, with only final
> stats available.

If you're saying the driver message would still be there if
verification or translation failed that's perfectly fine, we 
can definitely adjust the test. But some check that driver 
message reporting is working is needed, don't just remove it.

Sorry, don't have cycles to look closely :(
Jiri Benc Nov. 6, 2020, 8:24 a.m. UTC | #6
On Thu, 5 Nov 2020 14:57:13 -0800, Jakub Kicinski wrote:
> If you're saying the driver message would still be there if

> verification or translation failed that's perfectly fine, we 

> can definitely adjust the test. But some check that driver 

> message reporting is working is needed, don't just remove it.


Should we change the test to fail the verification? Sounds reasonable
to me.

 Jiri
Jiri Benc Nov. 6, 2020, 8:26 a.m. UTC | #7
On Thu, 5 Nov 2020 13:22:12 -0800, Andrii Nakryiko wrote:
> test_progs is the only test runner that's run continuously on every
> patch. libbpf CI also runs test_maps and test_verifier. All the other
> test binaries/scripts rely on humans to not forget about them. Which
> works so-so, as you can see :)

Did not know that, thanks for the explanation.

Would be good to add full testing to some of the numerous CIs we have
(some taker? :-)).

 Jiri
Jakub Kicinski Nov. 6, 2020, 6:52 p.m. UTC | #8
On Fri, 6 Nov 2020 09:24:48 +0100 Jiri Benc wrote:
> On Thu, 5 Nov 2020 14:57:13 -0800, Jakub Kicinski wrote:

> > If you're saying the driver message would still be there if

> > verification or translation failed that's perfectly fine, we 

> > can definitely adjust the test. But some check that driver 

> > message reporting is working is needed, don't just remove it.  

> 

> Should we change the test to fail the verification? Sounds reasonable

> to me.


Yeah, you'd need a new debugfs knob in netdevsim to trigger the failure
from instruction verification, then (to match where message is printed).
And make the output validation a separate case, not just a check in
some random success load.
diff mbox series

Patch

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 38cfcf701eeb..2a98d9fb2eef 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -168,6 +168,8 @@  struct bpf_verifier_stack_elem {
 	int insn_idx;
 	int prev_insn_idx;
 	struct bpf_verifier_stack_elem *next;
+	/* length of verifier log at the time this state was pushed on stack */
+	u32 log_pos;
 };
 
 #define BPF_COMPLEXITY_LIMIT_JMP_SEQ	8192
@@ -283,6 +285,18 @@  void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt,
 		log->ubuf = NULL;
 }
 
+static void bpf_vlog_reset(struct bpf_verifier_log *log, u32 new_pos)
+{
+	char zero = 0;
+
+	if (!bpf_verifier_log_needed(log))
+		return;
+
+	log->len_used = new_pos;
+	if (put_user(zero, log->ubuf + new_pos))
+		log->ubuf = NULL;
+}
+
 /* log_level controls verbosity level of eBPF verifier.
  * bpf_verifier_log_write() is used to dump the verification trace to the log,
  * so the user can figure out what's wrong with the program
@@ -846,7 +860,7 @@  static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifi
 }
 
 static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx,
-		     int *insn_idx)
+		     int *insn_idx, bool pop_log)
 {
 	struct bpf_verifier_state *cur = env->cur_state;
 	struct bpf_verifier_stack_elem *elem, *head = env->head;
@@ -860,6 +874,8 @@  static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx,
 		if (err)
 			return err;
 	}
+	if (pop_log)
+		bpf_vlog_reset(&env->log, head->log_pos);
 	if (insn_idx)
 		*insn_idx = head->insn_idx;
 	if (prev_insn_idx)
@@ -887,6 +903,7 @@  static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
 	elem->insn_idx = insn_idx;
 	elem->prev_insn_idx = prev_insn_idx;
 	elem->next = env->head;
+	elem->log_pos = env->log.len_used;
 	env->head = elem;
 	env->stack_size++;
 	err = copy_verifier_state(&elem->st, cur);
@@ -915,7 +932,7 @@  static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
 	free_verifier_state(env->cur_state, true);
 	env->cur_state = NULL;
 	/* pop all elements and return */
-	while (!pop_stack(env, NULL, NULL));
+	while (!pop_stack(env, NULL, NULL, false));
 	return NULL;
 }
 
@@ -8399,6 +8416,7 @@  static bool reg_type_mismatch(enum bpf_reg_type src, enum bpf_reg_type prev)
 
 static int do_check(struct bpf_verifier_env *env)
 {
+	bool pop_log = !(env->log.level & BPF_LOG_LEVEL2);
 	struct bpf_verifier_state *state = env->cur_state;
 	struct bpf_insn *insns = env->prog->insnsi;
 	struct bpf_reg_state *regs;
@@ -8675,7 +8693,7 @@  static int do_check(struct bpf_verifier_env *env)
 process_bpf_exit:
 				update_branch_counts(env, env->cur_state);
 				err = pop_stack(env, &prev_insn_idx,
-						&env->insn_idx);
+						&env->insn_idx, pop_log);
 				if (err < 0) {
 					if (err != -ENOENT)
 						return err;
@@ -10198,6 +10216,7 @@  static void sanitize_insn_aux_data(struct bpf_verifier_env *env)
 
 static int do_check_common(struct bpf_verifier_env *env, int subprog)
 {
+	bool pop_log = !(env->log.level & BPF_LOG_LEVEL2);
 	struct bpf_verifier_state *state;
 	struct bpf_reg_state *regs;
 	int ret, i;
@@ -10260,7 +10279,9 @@  static int do_check_common(struct bpf_verifier_env *env, int subprog)
 		free_verifier_state(env->cur_state, true);
 		env->cur_state = NULL;
 	}
-	while (!pop_stack(env, NULL, NULL));
+	while (!pop_stack(env, NULL, NULL, false));
+	if (!ret && pop_log)
+		bpf_vlog_reset(&env->log, 0);
 	free_states(env);
 	if (ret)
 		/* clean aux data in case subprog was rejected */
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 87eaa49609a0..ad6939c67c5e 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -943,7 +943,12 @@  static void do_test_single(struct bpf_test *test, bool unpriv,
 	attr.insns = prog;
 	attr.insns_cnt = prog_len;
 	attr.license = "GPL";
-	attr.log_level = verbose || expected_ret == VERBOSE_ACCEPT ? 1 : 4;
+	if (verbose)
+		attr.log_level = 1;
+	else if (expected_ret == VERBOSE_ACCEPT)
+		attr.log_level = 2;
+	else
+		attr.log_level = 4;
 	attr.prog_flags = pflags;
 
 	fd_prog = bpf_load_program_xattr(&attr, bpf_vlog, sizeof(bpf_vlog));