Message ID | 1405424541-21803-4-git-send-email-alex.bennee@linaro.org
---|---
State | New
Headers | show
Hi, Am 15.07.2014 13:42, schrieb Alex Bennée: > The tb_find_fast path is important to quickly moving from one block to > the next. However we need to flush it when tlb changes occur so it's > important to know how well we are doing with the cache. > > This patch adds some basic hit/miss profiling to the tb_find_fast > tracepoint as well as a number of other tb_ related areas. I've also > added a trace_inc_counter() helper which gets inlined away when tracing > is disabled. > > Signed-off-by: Alex Bennée <alex.bennee@linaro.org> > > diff --git a/cpu-exec.c b/cpu-exec.c > index 45ef77b..771272f 100644 > --- a/cpu-exec.c > +++ b/cpu-exec.c > @@ -187,7 +187,10 @@ static inline TranslationBlock *tb_find_fast(CPUArchState *env) > tb = cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)]; > if (unlikely(!tb || tb->pc != pc || tb->cs_base != cs_base || > tb->flags != flags)) { > + trace_inc_counter(&cpu->tb_jmp_cache_stats.misses); > tb = tb_find_slow(env, pc, cs_base, flags); > + } else { > + trace_inc_counter(&cpu->tb_jmp_cache_stats.hits); > } > return tb; > } > diff --git a/cputlb.c b/cputlb.c > index 7bd3573..672656a 100644 > --- a/cputlb.c > +++ b/cputlb.c > @@ -58,7 +58,7 @@ void tlb_flush(CPUState *cpu, int flush_global) > cpu->current_tb = NULL; > > memset(env->tlb_table, -1, sizeof(env->tlb_table)); > - memset(cpu->tb_jmp_cache, 0, sizeof(cpu->tb_jmp_cache)); > + tb_flush_all_jmp_cache(cpu); > > env->tlb_flush_addr = -1; > env->tlb_flush_mask = 0; > diff --git a/include/qom/cpu.h b/include/qom/cpu.h > index df977c8..8376678 100644 > --- a/include/qom/cpu.h > +++ b/include/qom/cpu.h > @@ -243,6 +243,10 @@ struct CPUState { > void *env_ptr; /* CPUArchState */ > struct TranslationBlock *current_tb; > struct TranslationBlock *tb_jmp_cache[TB_JMP_CACHE_SIZE]; > + struct { > + int hits; > + int misses; Is anything else going to be added here? If not, the indentation can be dropped. > + } tb_jmp_cache_stats; This is lacking documentation. 
Should be trivial to add for this field (not here, above the struct). To document the subfields we may need to name the struct. > struct GDBRegisterState *gdb_regs; > int gdb_num_regs; > int gdb_num_g_regs; > @@ -584,6 +588,15 @@ void cpu_exit(CPUState *cpu); > */ > void cpu_resume(CPUState *cpu); > > + > +/** > + * tb_flush_all_jmp_cache: > + * @cpu: The CPU jmp cache to flush > + * > + * Flush all the entries from the cpu fast jump cache "CPU" for consistency > + */ > +void tb_flush_all_jmp_cache(CPUState *cpu); > + > /** > * qemu_init_vcpu: > * @cpu: The vCPU to initialize. > diff --git a/include/trace.h b/include/trace.h > index c15f498..7a9c0dc 100644 > --- a/include/trace.h > +++ b/include/trace.h > @@ -3,4 +3,14 @@ > > #include "trace/generated-tracers.h" > > +#ifndef CONFIG_TRACE_NOP > +static inline void trace_inc_counter(int *counter) { > + int cnt = *counter; > + cnt++; > + *counter = cnt; > +} > +#else > +static inline void trace_inc_counter(int *counter) { /* do nothing */ } > +#endif > + > #endif /* TRACE_H */ Coding Style issues with the first function. For simplicity just keep the first implementation but with the proper brace placement, and then just put the #ifdef into the function body. That avoids the signatures getting out of sync. 
> diff --git a/qom/cpu.c b/qom/cpu.c > index fada2d4..956b36d 100644 > --- a/qom/cpu.c > +++ b/qom/cpu.c > @@ -244,7 +244,7 @@ static void cpu_common_reset(CPUState *cpu) > cpu->icount_extra = 0; > cpu->icount_decr.u32 = 0; > cpu->can_do_io = 0; > - memset(cpu->tb_jmp_cache, 0, TB_JMP_CACHE_SIZE * sizeof(void *)); > + tb_flush_all_jmp_cache(cpu); > } > > static bool cpu_common_has_work(CPUState *cs) > diff --git a/trace-events b/trace-events > index f8cc35f..5a58a11 100644 > --- a/trace-events > +++ b/trace-events > @@ -1244,6 +1244,9 @@ exec_tb_exit(void *next_tb, unsigned int flags) "tb:%p flags=%x" > > # translate-all.c > translate_block(void *tb, uintptr_t pc, uint8_t *tb_code) "tb:%p, pc:0x%x, tb_code:%p" > +tb_flush(void) "" > +tb_flush_jump_cache(uintptr_t pc) "pc:0x%x" > +tb_flush_all_jump_cache(int hits, int misses) "hits:%d misses:%d" > > # memory.c > memory_region_ops_read(void *mr, uint64_t addr, uint64_t value, unsigned size) "mr %p addr %#"PRIx64" value %#"PRIx64" size %u" > diff --git a/translate-all.c b/translate-all.c > index a11c083..8e7bbcc 100644 > --- a/translate-all.c > +++ b/translate-all.c > @@ -714,12 +714,22 @@ static void page_flush_tb(void) > } > } > > +void tb_flush_all_jmp_cache(CPUState *cpu) > +{ > + trace_tb_flush_all_jump_cache(cpu->tb_jmp_cache_stats.hits, > + cpu->tb_jmp_cache_stats.misses); > + memset(cpu->tb_jmp_cache, 0, sizeof(cpu->tb_jmp_cache)); > + memset((void *) &cpu->tb_jmp_cache_stats, 0, sizeof(cpu->tb_jmp_cache_stats)); > +} > + > /* flush all the translation blocks */ > /* XXX: tb_flush is currently not thread safe */ > void tb_flush(CPUArchState *env1) > { > CPUState *cpu = ENV_GET_CPU(env1); > > + trace_tb_flush(); > + > #if defined(DEBUG_FLUSH) > printf("qemu: flush code_size=%ld nb_tbs=%d avg_tb_size=%ld\n", > (unsigned long)(tcg_ctx.code_gen_ptr - tcg_ctx.code_gen_buffer), > @@ -734,7 +744,7 @@ void tb_flush(CPUArchState *env1) > tcg_ctx.tb_ctx.nb_tbs = 0; > > CPU_FOREACH(cpu) { > - memset(cpu->tb_jmp_cache, 0, 
sizeof(cpu->tb_jmp_cache)); > + tb_flush_all_jmp_cache(cpu); > } > > memset(tcg_ctx.tb_ctx.tb_phys_hash, 0, sizeof(tcg_ctx.tb_ctx.tb_phys_hash)); > @@ -1520,6 +1530,8 @@ void tb_flush_jmp_cache(CPUState *cpu, target_ulong addr) > i = tb_jmp_cache_hash_page(addr); > memset(&cpu->tb_jmp_cache[i], 0, > TB_JMP_PAGE_SIZE * sizeof(TranslationBlock *)); Can this one be dropped, too? > + > + trace_tb_flush_jump_cache(addr); > } > > void dump_exec_info(FILE *f, fprintf_function cpu_fprintf) Cheers, Andreas
On 15 July 2014 12:42, Alex Bennée <alex.bennee@linaro.org> wrote: > +#ifndef CONFIG_TRACE_NOP > +static inline void trace_inc_counter(int *counter) { > + int cnt = *counter; > + cnt++; > + *counter = cnt; > +} ...why isn't this just "*counter++;" ? -- PMM
On 15 July 2014 13:23, Peter Maydell <peter.maydell@linaro.org> wrote: > On 15 July 2014 12:42, Alex Bennée <alex.bennee@linaro.org> wrote: >> +#ifndef CONFIG_TRACE_NOP >> +static inline void trace_inc_counter(int *counter) { >> + int cnt = *counter; >> + cnt++; >> + *counter = cnt; >> +} > > ...why isn't this just "*counter++;" ? Derp. (*counter)++; I leave for the reader to decide whether this constitutes an argument in favour of the way you originally phrased it... -- PMM
Peter Maydell writes: > On 15 July 2014 12:42, Alex Bennée <alex.bennee@linaro.org> wrote: >> +#ifndef CONFIG_TRACE_NOP >> +static inline void trace_inc_counter(int *counter) { >> + int cnt = *counter; >> + cnt++; >> + *counter = cnt; >> +} > > ...why isn't this just "*counter++;" ? You of course mean: (*counter)++; I'll fix that up for the next iteration...
Andreas Färber writes: > Hi, > > Am 15.07.2014 13:42, schrieb Alex Bennée: <snip> >> index df977c8..8376678 100644 >> --- a/include/qom/cpu.h >> +++ b/include/qom/cpu.h >> @@ -243,6 +243,10 @@ struct CPUState { >> void *env_ptr; /* CPUArchState */ >> struct TranslationBlock *current_tb; >> struct TranslationBlock *tb_jmp_cache[TB_JMP_CACHE_SIZE]; >> + struct { >> + int hits; >> + int misses; > > Is anything else going to be added here? If not, the indentation can be > dropped. At the moment probably not. > >> + } tb_jmp_cache_stats; > > This is lacking documentation. Should be trivial to add for this field > (not here, above the struct). To document the subfields we may need to > name the struct. Would not a simple comment be enough? > >> struct GDBRegisterState *gdb_regs; >> int gdb_num_regs; >> int gdb_num_g_regs; >> @@ -584,6 +588,15 @@ void cpu_exit(CPUState *cpu); >> */ >> void cpu_resume(CPUState *cpu); >> >> + >> +/** >> + * tb_flush_all_jmp_cache: >> + * @cpu: The CPU jmp cache to flush >> + * >> + * Flush all the entries from the cpu fast jump cache > > "CPU" for consistency OK > <snip> >> +#ifndef CONFIG_TRACE_NOP >> +static inline void trace_inc_counter(int *counter) { >> + int cnt = *counter; >> + cnt++; >> + *counter = cnt; >> +} >> +#else >> +static inline void trace_inc_counter(int *counter) { /* do nothing */ } >> +#endif >> + >> #endif /* TRACE_H */ > > Coding Style issues with the first function. For simplicity just keep > the first implementation but with the proper brace placement, and then > just put the #ifdef into the function body. That avoids the signatures > getting out of sync. mea-culpa, I forgot to run checkpatch.pl.... 
>> >> memset(tcg_ctx.tb_ctx.tb_phys_hash, 0, sizeof(tcg_ctx.tb_ctx.tb_phys_hash)); >> @@ -1520,6 +1530,8 @@ void tb_flush_jmp_cache(CPUState *cpu, target_ulong addr) >> i = tb_jmp_cache_hash_page(addr); >> memset(&cpu->tb_jmp_cache[i], 0, >> TB_JMP_PAGE_SIZE * sizeof(TranslationBlock *)); > > Can this one be dropped, too? No, this is only a partial invalidation. I did toy with instrumenting how many entries are flushed but it didn't seem to be worth it, given that with the current architecture there is not much we can do. > >> + >> + trace_tb_flush_jump_cache(addr); >> } >> >> void dump_exec_info(FILE *f, fprintf_function cpu_fprintf) > > Cheers, > Andreas
Il 15/07/2014 13:42, Alex Bennée ha scritto: > + trace_inc_counter(&cpu->tb_jmp_cache_stats.misses); > tb = tb_find_slow(env, pc, cs_base, flags); > + } else { > + trace_inc_counter(&cpu->tb_jmp_cache_stats.hits); > } I think this is premature optimization... Paolo
Paolo Bonzini writes: > Il 15/07/2014 13:42, Alex Bennée ha scritto: >> + trace_inc_counter(&cpu->tb_jmp_cache_stats.misses); >> tb = tb_find_slow(env, pc, cs_base, flags); >> + } else { >> + trace_inc_counter(&cpu->tb_jmp_cache_stats.hits); >> } > > I think this is premature optimization... How do you mean? It's not really an optimization as much as an instrumentation. It should compile away to nothing if you don't have tracing enabled in your build. OTOH the numbers I'm seeing are very interesting in so far as the fast path could be a potential waste of code in a lot of cases.
Il 15/07/2014 16:16, Alex Bennée ha scritto: >> > I think this is premature optimization... > How do you mean? It's not really an optimization as much as an > instrumentation. It should compile away to nothing if you don't have > tracing enabled in your build. I think it's not a big deal if you always enable the counting, and perhaps show them in "info jit". Paolo > OTOH the numbers I'm seeing are very interesting in so far as the fast > path could be a potential waste of code in a lot of cases.
On 15 July 2014 21:11, Paolo Bonzini <pbonzini@redhat.com> wrote: > Il 15/07/2014 16:16, Alex Bennée ha scritto: > >>> > I think this is premature optimization... >> >> How do you mean? It's not really an optimization as much as an >> instrumentation. It should compile away to nothing if you don't have >> tracing enabled in your build. > > > I think it's not a big deal if you always enable the counting, and perhaps > show them in "info jit". We don't enable any other tracepoints by default; why would we want to enable just this one which is in a hot codepath?? -- PMM
Il 15/07/2014 22:29, Peter Maydell ha scritto: >> > >> > I think it's not a big deal if you always enable the counting, and perhaps >> > show them in "info jit". > We don't enable any other tracepoints by default; why would > we want to enable just this one which is in a hot codepath?? I'm not referring to the tracepoint, only to trace_inc_counter instead of just "x++". The overhead is probably not measurable. There are a few low-hanging fruit optimizations in cpu-exec.c that probably would give more measurable benefit, for example trying to replace cpu_loop_exit with a goto whenever possible. Paolo
diff --git a/cpu-exec.c b/cpu-exec.c index 45ef77b..771272f 100644 --- a/cpu-exec.c +++ b/cpu-exec.c @@ -187,7 +187,10 @@ static inline TranslationBlock *tb_find_fast(CPUArchState *env) tb = cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)]; if (unlikely(!tb || tb->pc != pc || tb->cs_base != cs_base || tb->flags != flags)) { + trace_inc_counter(&cpu->tb_jmp_cache_stats.misses); tb = tb_find_slow(env, pc, cs_base, flags); + } else { + trace_inc_counter(&cpu->tb_jmp_cache_stats.hits); } return tb; } diff --git a/cputlb.c b/cputlb.c index 7bd3573..672656a 100644 --- a/cputlb.c +++ b/cputlb.c @@ -58,7 +58,7 @@ void tlb_flush(CPUState *cpu, int flush_global) cpu->current_tb = NULL; memset(env->tlb_table, -1, sizeof(env->tlb_table)); - memset(cpu->tb_jmp_cache, 0, sizeof(cpu->tb_jmp_cache)); + tb_flush_all_jmp_cache(cpu); env->tlb_flush_addr = -1; env->tlb_flush_mask = 0; diff --git a/include/qom/cpu.h b/include/qom/cpu.h index df977c8..8376678 100644 --- a/include/qom/cpu.h +++ b/include/qom/cpu.h @@ -243,6 +243,10 @@ struct CPUState { void *env_ptr; /* CPUArchState */ struct TranslationBlock *current_tb; struct TranslationBlock *tb_jmp_cache[TB_JMP_CACHE_SIZE]; + struct { + int hits; + int misses; + } tb_jmp_cache_stats; struct GDBRegisterState *gdb_regs; int gdb_num_regs; int gdb_num_g_regs; @@ -584,6 +588,15 @@ void cpu_exit(CPUState *cpu); */ void cpu_resume(CPUState *cpu); + +/** + * tb_flush_all_jmp_cache: + * @cpu: The CPU jmp cache to flush + * + * Flush all the entries from the cpu fast jump cache + */ +void tb_flush_all_jmp_cache(CPUState *cpu); + /** * qemu_init_vcpu: * @cpu: The vCPU to initialize. 
diff --git a/include/trace.h b/include/trace.h index c15f498..7a9c0dc 100644 --- a/include/trace.h +++ b/include/trace.h @@ -3,4 +3,14 @@ #include "trace/generated-tracers.h" +#ifndef CONFIG_TRACE_NOP +static inline void trace_inc_counter(int *counter) { + int cnt = *counter; + cnt++; + *counter = cnt; +} +#else +static inline void trace_inc_counter(int *counter) { /* do nothing */ } +#endif + #endif /* TRACE_H */ diff --git a/qom/cpu.c b/qom/cpu.c index fada2d4..956b36d 100644 --- a/qom/cpu.c +++ b/qom/cpu.c @@ -244,7 +244,7 @@ static void cpu_common_reset(CPUState *cpu) cpu->icount_extra = 0; cpu->icount_decr.u32 = 0; cpu->can_do_io = 0; - memset(cpu->tb_jmp_cache, 0, TB_JMP_CACHE_SIZE * sizeof(void *)); + tb_flush_all_jmp_cache(cpu); } static bool cpu_common_has_work(CPUState *cs) diff --git a/trace-events b/trace-events index f8cc35f..5a58a11 100644 --- a/trace-events +++ b/trace-events @@ -1244,6 +1244,9 @@ exec_tb_exit(void *next_tb, unsigned int flags) "tb:%p flags=%x" # translate-all.c translate_block(void *tb, uintptr_t pc, uint8_t *tb_code) "tb:%p, pc:0x%x, tb_code:%p" +tb_flush(void) "" +tb_flush_jump_cache(uintptr_t pc) "pc:0x%x" +tb_flush_all_jump_cache(int hits, int misses) "hits:%d misses:%d" # memory.c memory_region_ops_read(void *mr, uint64_t addr, uint64_t value, unsigned size) "mr %p addr %#"PRIx64" value %#"PRIx64" size %u" diff --git a/translate-all.c b/translate-all.c index a11c083..8e7bbcc 100644 --- a/translate-all.c +++ b/translate-all.c @@ -714,12 +714,22 @@ static void page_flush_tb(void) } } +void tb_flush_all_jmp_cache(CPUState *cpu) +{ + trace_tb_flush_all_jump_cache(cpu->tb_jmp_cache_stats.hits, + cpu->tb_jmp_cache_stats.misses); + memset(cpu->tb_jmp_cache, 0, sizeof(cpu->tb_jmp_cache)); + memset((void *) &cpu->tb_jmp_cache_stats, 0, sizeof(cpu->tb_jmp_cache_stats)); +} + /* flush all the translation blocks */ /* XXX: tb_flush is currently not thread safe */ void tb_flush(CPUArchState *env1) { CPUState *cpu = ENV_GET_CPU(env1); + 
trace_tb_flush(); + #if defined(DEBUG_FLUSH) printf("qemu: flush code_size=%ld nb_tbs=%d avg_tb_size=%ld\n", (unsigned long)(tcg_ctx.code_gen_ptr - tcg_ctx.code_gen_buffer), @@ -734,7 +744,7 @@ void tb_flush(CPUArchState *env1) tcg_ctx.tb_ctx.nb_tbs = 0; CPU_FOREACH(cpu) { - memset(cpu->tb_jmp_cache, 0, sizeof(cpu->tb_jmp_cache)); + tb_flush_all_jmp_cache(cpu); } memset(tcg_ctx.tb_ctx.tb_phys_hash, 0, sizeof(tcg_ctx.tb_ctx.tb_phys_hash)); @@ -1520,6 +1530,8 @@ void tb_flush_jmp_cache(CPUState *cpu, target_ulong addr) i = tb_jmp_cache_hash_page(addr); memset(&cpu->tb_jmp_cache[i], 0, TB_JMP_PAGE_SIZE * sizeof(TranslationBlock *)); + + trace_tb_flush_jump_cache(addr); } void dump_exec_info(FILE *f, fprintf_function cpu_fprintf)
The tb_find_fast path is important to quickly moving from one block to the next. However we need to flush it when tlb changes occur so it's important to know how well we are doing with the cache. This patch adds some basic hit/miss profiling to the tb_find_fast tracepoint as well as a number of other tb_ related areas. I've also added a trace_inc_counter() helper which gets inlined away when tracing is disabled. Signed-off-by: Alex Bennée <alex.bennee@linaro.org>