@@ -9,6 +9,10 @@
#include "qemu/osdep.h"
#include "disas/disas.h"
+#include "exec/exec-all.h"
+#include "tcg.h"
+
+#include "qemu/qemu-print.h"
#include "exec/tb-stats.h"
@@ -18,6 +22,97 @@ enum TBStatsStatus { TB_STATS_RUNNING, TB_STATS_PAUSED, TB_STATS_STOPPED };
static enum TBStatsStatus tcg_collect_tb_stats;
static uint32_t default_tbstats_flag;
+struct jit_profile_info {
+ uint64_t translations;
+ uint64_t aborted;
+ uint64_t ops;
+ unsigned ops_max;
+ uint64_t del_ops;
+ uint64_t temps;
+ unsigned temps_max;
+ uint64_t host;
+ uint64_t guest;
+ uint64_t search_data;
+};
+
+/* accumulate the statistics from all TBs */
+static void collect_jit_profile_info(void *p, uint32_t hash, void *userp)
+{
+ struct jit_profile_info *jpi = userp;
+ TBStatistics *tbs = p;
+
+ jpi->translations += tbs->translations.total;
+ jpi->ops += tbs->code.num_tcg_ops;
+ if (stat_per_translation(tbs, code.num_tcg_ops) > jpi->ops_max) {
+ jpi->ops_max = stat_per_translation(tbs, code.num_tcg_ops);
+ }
+ jpi->del_ops += tbs->code.deleted_ops;
+ jpi->temps += tbs->code.temps;
+ if (stat_per_translation(tbs, code.temps) > jpi->temps_max) {
+ jpi->temps_max = stat_per_translation(tbs, code.temps);
+ }
+ jpi->host += tbs->code.out_len;
+ jpi->guest += tbs->code.in_len;
+ jpi->search_data += tbs->code.search_out_len;
+}
+
+/* dump JIT statisticis using TCGProfile and TBStats */
+void dump_jit_profile_info(TCGProfile *s)
+{
+ if (!tb_stats_collection_enabled()) {
+ return;
+ }
+
+ struct jit_profile_info *jpi = g_new0(struct jit_profile_info, 1);
+
+ qht_iter(&tb_ctx.tb_stats, collect_jit_profile_info, jpi);
+
+ if (jpi->translations) {
+ qemu_printf("translated TBs %" PRId64 "\n", jpi->translations);
+ qemu_printf("avg ops/TB %0.1f max=%d\n",
+ jpi->ops / (double) jpi->translations, jpi->ops_max);
+ qemu_printf("deleted ops/TB %0.2f\n",
+ jpi->del_ops / (double) jpi->translations);
+ qemu_printf("avg temps/TB %0.2f max=%d\n",
+ jpi->temps / (double) jpi->translations, jpi->temps_max);
+ qemu_printf("avg host code/TB %0.1f\n",
+ jpi->host / (double) jpi->translations);
+ qemu_printf("avg search data/TB %0.1f\n",
+ jpi->search_data / (double) jpi->translations);
+
+ if (s) {
+ int64_t tot = s->interm_time + s->code_time;
+ qemu_printf("JIT cycles %" PRId64 " (%0.3f s at 2.4 GHz)\n",
+ tot, tot / 2.4e9);
+ qemu_printf("cycles/op %0.1f\n",
+ jpi->ops ? (double)tot / jpi->ops : 0);
+ qemu_printf("cycles/in byte %0.1f\n",
+ jpi->guest ? (double)tot / jpi->guest : 0);
+ qemu_printf("cycles/out byte %0.1f\n",
+ jpi->host ? (double)tot / jpi->host : 0);
+ qemu_printf("cycles/search byte %0.1f\n",
+ jpi->search_data ? (double)tot / jpi->search_data : 0);
+ if (tot == 0) {
+ tot = 1;
+ }
+ qemu_printf(" gen_interm time %0.1f%%\n",
+ (double)s->interm_time / tot * 100.0);
+ qemu_printf(" gen_code time %0.1f%%\n",
+ (double)s->code_time / tot * 100.0);
+ qemu_printf("optim./code time %0.1f%%\n",
+ (double)s->opt_time / (s->code_time ? s->code_time : 1)
+ * 100.0);
+ qemu_printf("liveness/code time %0.1f%%\n",
+ (double)s->la_time / (s->code_time ? s->code_time : 1) * 100.0);
+ qemu_printf("cpu_restore count %" PRId64 "\n",
+ s->restore_count);
+ qemu_printf(" avg cycles %0.1f\n",
+ s->restore_count ? (double)s->restore_time / s->restore_count : 0);
+ }
+ }
+ g_free(jpi);
+}
+
void init_tb_stats_htable_if_not(void)
{
if (tb_stats_collection_enabled() && !tb_ctx.tb_stats.map) {
@@ -1780,8 +1780,6 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
tb_overflow:
#ifdef CONFIG_PROFILER
- /* includes aborted translations because of exceptions */
- atomic_set(&prof->tb_count1, prof->tb_count1 + 1);
ti = profile_getclock();
#endif
@@ -1807,6 +1805,7 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
tb->tb_stats = NULL;
}
+
tcg_func_start(tcg_ctx);
tcg_ctx->cpu = env_cpu(env);
@@ -1828,7 +1827,6 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
}
#ifdef CONFIG_PROFILER
- atomic_set(&prof->tb_count, prof->tb_count + 1);
atomic_set(&prof->interm_time, prof->interm_time + profile_getclock() - ti);
ti = profile_getclock();
#endif
@@ -1875,9 +1873,6 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
#ifdef CONFIG_PROFILER
atomic_set(&prof->code_time, prof->code_time + profile_getclock() - ti);
- atomic_set(&prof->code_in_len, prof->code_in_len + tb->size);
- atomic_set(&prof->code_out_len, prof->code_out_len + gen_code_size);
- atomic_set(&prof->search_out_len, prof->search_out_len + search_size);
#endif
#ifdef DEBUG_DISAS
@@ -1951,7 +1946,11 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
ts->code.num_tcg_ops += prof->translation.nb_ops_pre_opt;
ts->code.num_tcg_ops_opt += tcg_ctx->nb_ops;
ts->code.spills += prof->translation.nb_spills;
+ ts->code.temps += prof->translation.temp_count;
+ ts->code.deleted_ops += prof->translation.del_op_count;
+ ts->code.in_len += tb->size;
ts->code.out_len += tb->tc.size;
+ ts->code.search_out_len += search_size;
ts->translations.total++;
if (phys_page2 != -1) {
@@ -33,6 +33,9 @@
#define tb_stats_enabled(tb, JIT_STATS) \
(tb && tb->tb_stats && (tb->tb_stats->stats_enabled & JIT_STATS))
+#define stat_per_translation(stat, name) \
+ (stat->translations.total ? stat->name / stat->translations.total : 0)
+
typedef struct TBStatistics TBStatistics;
/*
@@ -67,7 +70,13 @@ struct TBStatistics {
unsigned num_tcg_ops;
unsigned num_tcg_ops_opt;
unsigned spills;
+
+ /* CONFIG_PROFILE */
+ unsigned temps;
+ unsigned deleted_ops;
+ unsigned in_len;
unsigned out_len;
+ unsigned search_out_len;
} code;
struct {
@@ -87,6 +96,8 @@ bool tb_stats_cmp(const void *ap, const void *bp);
void init_tb_stats_htable_if_not(void);
+void dump_jit_profile_info(TCGProfile *s);
+
#define TB_NOTHING (1 << 0)
#define TB_EXEC_STATS (1 << 1)
#define TB_JIT_STATS (1 << 2)
@@ -1118,6 +1118,8 @@ void tcg_func_start(TCGContext *s)
s->current_frame_offset = s->frame_start;
s->prof.translation.nb_spills = 0;
+ s->prof.translation.del_op_count = 0;
+ s->prof.translation.temp_count = 0;
#ifdef CONFIG_DEBUG_TCG
s->goto_tb_issue_mask = 0;
#endif
@@ -2294,10 +2296,8 @@ void tcg_op_remove(TCGContext *s, TCGOp *op)
QTAILQ_REMOVE(&s->ops, op, link);
QTAILQ_INSERT_TAIL(&s->free_ops, op, link);
s->nb_ops--;
-
-#ifdef CONFIG_PROFILER
- atomic_set(&s->prof.del_op_count, s->prof.del_op_count + 1);
-#endif
+ /* ? won't this end up op_opt - op = del_op_count ? */
+ s->prof.translation.del_op_count++;
}
static TCGOp *tcg_op_alloc(TCGOpcode opc)
@@ -3910,16 +3910,6 @@ void tcg_profile_snapshot(TCGProfile *prof, bool counters, bool table)
if (counters) {
PROF_ADD(prof, orig, cpu_exec_time);
- PROF_ADD(prof, orig, tb_count1);
- PROF_ADD(prof, orig, tb_count);
- PROF_ADD(prof, orig, op_count);
- PROF_MAX(prof, orig, op_count_max);
- PROF_ADD(prof, orig, temp_count);
- PROF_MAX(prof, orig, temp_count_max);
- PROF_ADD(prof, orig, del_op_count);
- PROF_ADD(prof, orig, code_in_len);
- PROF_ADD(prof, orig, code_out_len);
- PROF_ADD(prof, orig, search_out_len);
PROF_ADD(prof, orig, interm_time);
PROF_ADD(prof, orig, code_time);
PROF_ADD(prof, orig, la_time);
@@ -3997,26 +3987,9 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
TCGOp *op;
s->current_tb = tb;
-
-#ifdef CONFIG_PROFILER
- {
- int n = 0;
-
- QTAILQ_FOREACH(op, &s->ops, link) {
- n++;
- }
- atomic_set(&prof->op_count, prof->op_count + n);
- if (n > prof->op_count_max) {
- atomic_set(&prof->op_count_max, n);
- }
-
- n = s->nb_temps;
- atomic_set(&prof->temp_count, prof->temp_count + n);
- if (n > prof->temp_count_max) {
- atomic_set(&prof->temp_count_max, n);
- }
- }
-#endif
+ /* save pre-optimisation op count */
+ prof->translation.nb_ops_pre_opt = s->nb_ops;
+ prof->translation.temp_count = s->nb_temps;
#ifdef DEBUG_DISAS
if (unlikely(qemu_loglevel_mask(CPU_LOG_TB_OP)
@@ -4029,8 +4002,6 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
}
#endif
- /* save pre-optimisation op count */
- prof->translation.nb_ops_pre_opt = s->nb_ops;
#ifdef CONFIG_DEBUG_TCG
/* Ensure all labels referenced have been emitted. */
@@ -4210,70 +4181,16 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
return tcg_current_code_size(s);
}
-#ifdef CONFIG_PROFILER
void tcg_dump_info(void)
{
+ TCGProfile *s = NULL;
+#ifdef CONFIG_PROFILER
TCGProfile prof = {};
- const TCGProfile *s;
- int64_t tb_count;
- int64_t tb_div_count;
- int64_t tot;
-
tcg_profile_snapshot_counters(&prof);
s = &prof;
- tb_count = s->tb_count;
- tb_div_count = tb_count ? tb_count : 1;
- tot = s->interm_time + s->code_time;
-
- qemu_printf("JIT cycles %" PRId64 " (%0.3f s at 2.4 GHz)\n",
- tot, tot / 2.4e9);
- qemu_printf("translated TBs %" PRId64 " (aborted=%" PRId64
- " %0.1f%%)\n",
- tb_count, s->tb_count1 - tb_count,
- (double)(s->tb_count1 - s->tb_count)
- / (s->tb_count1 ? s->tb_count1 : 1) * 100.0);
- qemu_printf("avg ops/TB %0.1f max=%d\n",
- (double)s->op_count / tb_div_count, s->op_count_max);
- qemu_printf("deleted ops/TB %0.2f\n",
- (double)s->del_op_count / tb_div_count);
- qemu_printf("avg temps/TB %0.2f max=%d\n",
- (double)s->temp_count / tb_div_count, s->temp_count_max);
- qemu_printf("avg host code/TB %0.1f\n",
- (double)s->code_out_len / tb_div_count);
- qemu_printf("avg search data/TB %0.1f\n",
- (double)s->search_out_len / tb_div_count);
-
- qemu_printf("cycles/op %0.1f\n",
- s->op_count ? (double)tot / s->op_count : 0);
- qemu_printf("cycles/in byte %0.1f\n",
- s->code_in_len ? (double)tot / s->code_in_len : 0);
- qemu_printf("cycles/out byte %0.1f\n",
- s->code_out_len ? (double)tot / s->code_out_len : 0);
- qemu_printf("cycles/search byte %0.1f\n",
- s->search_out_len ? (double)tot / s->search_out_len : 0);
- if (tot == 0) {
- tot = 1;
- }
- qemu_printf(" gen_interm time %0.1f%%\n",
- (double)s->interm_time / tot * 100.0);
- qemu_printf(" gen_code time %0.1f%%\n",
- (double)s->code_time / tot * 100.0);
- qemu_printf("optim./code time %0.1f%%\n",
- (double)s->opt_time / (s->code_time ? s->code_time : 1)
- * 100.0);
- qemu_printf("liveness/code time %0.1f%%\n",
- (double)s->la_time / (s->code_time ? s->code_time : 1) * 100.0);
- qemu_printf("cpu_restore count %" PRId64 "\n",
- s->restore_count);
- qemu_printf(" avg cycles %0.1f\n",
- s->restore_count ? (double)s->restore_time / s->restore_count : 0);
-}
-#else
-void tcg_dump_info(void)
-{
- qemu_printf("[TCG profiler not compiled]\n");
-}
#endif
+ dump_jit_profile_info(s);
+}
#ifdef ELF_HOST_MACHINE
/* In order to use this feature, the backend needs to do three things:
@@ -573,16 +573,13 @@ typedef struct TCGProfile {
int nb_guest_insns;
int nb_spills;
int nb_ops_pre_opt;
+
+ int del_op_count;
+ int temp_count;
} translation;
int64_t cpu_exec_time;
- int64_t tb_count1;
- int64_t tb_count;
int64_t op_count; /* total insn count */
- int op_count_max; /* max insn per TB */
- int temp_count_max;
- int64_t temp_count;
- int64_t del_op_count;
int64_t code_in_len;
int64_t code_out_len;
int64_t search_out_len;