@@ -810,6 +810,12 @@ void *tcg_malloc_internal(TCGContext *s, int size);
void tcg_pool_reset(TCGContext *s);
TranslationBlock *tcg_tb_alloc(TCGContext *s);
+void tcg_region_init(void);
+void tcg_region_reset_all(void);
+
+size_t tcg_code_size(void);
+size_t tcg_code_capacity(void);
+
/* Called with tb_lock held. */
static inline void *tcg_malloc(int size)
{
@@ -608,15 +608,13 @@ static inline void *alloc_code_gen_buffer(void)
{
void *buf = static_code_gen_buffer;
void *end = static_code_gen_buffer + sizeof(static_code_gen_buffer);
- size_t full_size, size;
+ size_t size;
/* page-align the beginning and end of the buffer */
buf = QEMU_ALIGN_PTR_UP(buf, qemu_real_host_page_size);
end = QEMU_ALIGN_PTR_DOWN(end, qemu_real_host_page_size);
- /* Reserve a guard page. */
- full_size = end - buf;
- size = full_size - qemu_real_host_page_size;
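+ /* no guard page here: tcg_region_init() carves a guard page out of each region */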
+ size = end - buf;
/* Honor a command-line option limiting the size of the buffer. */
if (size > tcg_ctx->code_gen_buffer_size) {
@@ -635,9 +633,6 @@ static inline void *alloc_code_gen_buffer(void)
if (qemu_mprotect_rwx(buf, size)) {
abort();
}
- if (qemu_mprotect_none(buf + size, qemu_real_host_page_size)) {
- abort();
- }
qemu_madvise(buf, size, QEMU_MADV_HUGEPAGE);
return buf;
@@ -646,22 +641,16 @@ static inline void *alloc_code_gen_buffer(void)
static inline void *alloc_code_gen_buffer(void)
{
size_t size = tcg_ctx->code_gen_buffer_size;
- void *buf1, *buf2;
-
- /* Perform the allocation in two steps, so that the guard page
- is reserved but uncommitted. */
- buf1 = VirtualAlloc(NULL, size + qemu_real_host_page_size,
- MEM_RESERVE, PAGE_NOACCESS);
- if (buf1 != NULL) {
- buf2 = VirtualAlloc(buf1, size, MEM_COMMIT, PAGE_EXECUTE_READWRITE);
- assert(buf1 == buf2);
- }
+ void *buf;
- return buf1;
+ buf = VirtualAlloc(NULL, size, MEM_RESERVE | MEM_COMMIT,
+ PAGE_EXECUTE_READWRITE);
+ return buf;
}
#else
static inline void *alloc_code_gen_buffer(void)
{
+ int prot = PROT_WRITE | PROT_READ | PROT_EXEC;
int flags = MAP_PRIVATE | MAP_ANONYMOUS;
uintptr_t start = 0;
size_t size = tcg_ctx->code_gen_buffer_size;
@@ -695,8 +684,7 @@ static inline void *alloc_code_gen_buffer(void)
# endif
# endif
- buf = mmap((void *)start, size + qemu_real_host_page_size,
- PROT_NONE, flags, -1, 0);
+ buf = mmap((void *)start, size, prot, flags, -1, 0);
if (buf == MAP_FAILED) {
return NULL;
}
@@ -706,24 +694,23 @@ static inline void *alloc_code_gen_buffer(void)
/* Try again, with the original still mapped, to avoid re-acquiring
that 256mb crossing. This time don't specify an address. */
size_t size2;
- void *buf2 = mmap(NULL, size + qemu_real_host_page_size,
- PROT_NONE, flags, -1, 0);
+ void *buf2 = mmap(NULL, size, prot, flags, -1, 0);
switch ((int)(buf2 != MAP_FAILED)) {
case 1:
if (!cross_256mb(buf2, size)) {
/* Success! Use the new buffer. */
- munmap(buf, size + qemu_real_host_page_size);
+ munmap(buf, size);
break;
}
/* Failure. Work with what we had. */
- munmap(buf2, size + qemu_real_host_page_size);
+ munmap(buf2, size);
/* fallthru */
default:
/* Split the original buffer. Free the smaller half. */
buf2 = split_cross_256mb(buf, size);
size2 = tcg_ctx->code_gen_buffer_size;
if (buf == buf2) {
- munmap(buf + size2 + qemu_real_host_page_size, size - size2);
+ munmap(buf + size2, size - size2);
} else {
munmap(buf, size - size2);
}
@@ -734,10 +721,6 @@ static inline void *alloc_code_gen_buffer(void)
}
#endif
- /* Make the final buffer accessible. The guard page at the end
- will remain inaccessible with PROT_NONE. */
- mprotect(buf, size, PROT_WRITE | PROT_READ | PROT_EXEC);
-
/* Request large pages for the buffer. */
qemu_madvise(buf, size, QEMU_MADV_HUGEPAGE);
@@ -918,13 +901,8 @@ static void do_tb_flush(CPUState *cpu, run_on_cpu_data tb_flush_count)
size_t host_size = 0;
g_tree_foreach(tb_ctx.tb_tree, tb_host_size_iter, &host_size);
- printf("qemu: flush code_size=%td nb_tbs=%zu avg_tb_size=%zu\n",
- tcg_ctx->code_gen_ptr - tcg_ctx->code_gen_buffer, nb_tbs,
- nb_tbs > 0 ? host_size / nb_tbs : 0);
- }
- if ((unsigned long)(tcg_ctx->code_gen_ptr - tcg_ctx->code_gen_buffer)
- > tcg_ctx->code_gen_buffer_size) {
- cpu_abort(cpu, "Internal error: code buffer overflow\n");
+ printf("qemu: flush code_size=%zu nb_tbs=%zu avg_tb_size=%zu\n",
+ tcg_code_size(), nb_tbs, nb_tbs > 0 ? host_size / nb_tbs : 0);
}
CPU_FOREACH(cpu) {
@@ -938,7 +916,7 @@ static void do_tb_flush(CPUState *cpu, run_on_cpu_data tb_flush_count)
qht_reset_size(&tb_ctx.htable, CODE_GEN_HTABLE_SIZE);
page_flush_tb();
- tcg_ctx->code_gen_ptr = tcg_ctx->code_gen_buffer;
+ tcg_region_reset_all();
/* XXX: flush processor icache at this point if cache flush is
expensive */
atomic_mb_set(&tb_ctx.tb_flush_count, tb_ctx.tb_flush_count + 1);
@@ -1276,9 +1254,9 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
phys_pc = get_page_addr_code(env, pc);
+ buffer_overflow:
tb = tb_alloc(pc);
if (unlikely(!tb)) {
- buffer_overflow:
/* flush must be done */
tb_flush(cpu);
mmap_unlock();
@@ -1382,9 +1360,9 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
}
#endif
- tcg_ctx->code_gen_ptr = (void *)
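+ /* code_gen_ptr is read concurrently by tcg_code_size(), so update it atomically */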
+ atomic_set(&tcg_ctx->code_gen_ptr, (void *)
ROUND_UP((uintptr_t)gen_code_buf + gen_code_size + search_size,
- CODE_GEN_ALIGN);
+ CODE_GEN_ALIGN));
/* init jump list */
assert(((uintptr_t)tb & 3) == 0);
@@ -1916,9 +1894,8 @@ void dump_exec_info(FILE *f, fprintf_function cpu_fprintf)
* otherwise users might think "-tb-size" is not honoured.
* For avg host size we use the precise numbers from tb_tree_stats though.
*/
- cpu_fprintf(f, "gen code size %td/%zd\n",
- tcg_ctx->code_gen_ptr - tcg_ctx->code_gen_buffer,
- tcg_ctx->code_gen_highwater - tcg_ctx->code_gen_buffer);
+ cpu_fprintf(f, "gen code size %zu/%zu\n",
+ tcg_code_size(), tcg_code_capacity());
cpu_fprintf(f, "TB count %zu\n", nb_tbs);
cpu_fprintf(f, "TB avg target size %zu max=%zu bytes\n",
nb_tbs ? tst.target_size / nb_tbs : 0,
@@ -978,6 +978,7 @@ int main(int argc, char **argv)
generating the prologue until now so that the prologue can take
the real value of GUEST_BASE into account. */
tcg_prologue_init(tcg_ctx);
+ tcg_region_init();
/* build Task State */
memset(ts, 0, sizeof(TaskState));
@@ -1664,6 +1664,18 @@ static void qemu_tcg_init_vcpu(CPUState *cpu)
char thread_name[VCPU_THREAD_NAME_SIZE];
static QemuCond *single_tcg_halt_cond;
static QemuThread *single_tcg_cpu_thread;
+ static int tcg_region_inited;
+
+ /*
+ * Initialize TCG regions--once. Now is a good time, because:
+ * (1) TCG's init context, prologue and target globals have been set up.
+ * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
+ * -accel flag is processed, so the check doesn't work then).
+ */
+ if (!tcg_region_inited) {
+ tcg_region_inited = 1;
+ tcg_region_init();
+ }
if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
cpu->thread = g_malloc0(sizeof(QemuThread));
@@ -4458,6 +4458,7 @@ int main(int argc, char **argv, char **envp)
generating the prologue until now so that the prologue can take
the real value of GUEST_BASE into account. */
tcg_prologue_init(tcg_ctx);
+ tcg_region_init();
#if defined(TARGET_I386)
env->cr[0] = CR0_PG_MASK | CR0_WP_MASK | CR0_PE_MASK;
@@ -121,6 +121,30 @@ static bool tcg_out_ldst_finalize(TCGContext *s);
static TCGContext **tcg_ctxs;
static unsigned int n_tcg_ctxs;
+/*
+ * We divide code_gen_buffer into equally-sized "regions" that TCG threads
+ * dynamically allocate from as demand dictates. Given appropriate region
+ * sizing, this minimizes flushes even when some TCG threads generate a lot
+ * more code than others.
+ */
+struct tcg_region_state {
+ QemuMutex lock;
+
+ /* fields set at init time */
+ void *start;
+ void *start_aligned;
+ void *end;
+ size_t n;
+ size_t size; /* size of one region */
+ size_t stride; /* .size + guard size */
+
+ /* fields protected by the lock */
+ size_t current; /* current region index */
+ size_t agg_size_full; /* aggregate size of full regions */
+};
+
+static struct tcg_region_state region;
+
static TCGRegSet tcg_target_available_regs[2];
static TCGRegSet tcg_target_call_clobber_regs;
@@ -258,6 +282,196 @@ TCGLabel *gen_new_label(void)
#include "tcg-target.inc.c"
+static void tcg_region_bounds(size_t curr_region, void **pstart, void **pend)
+{
+ void *start, *end;
+
+ start = region.start_aligned + curr_region * region.stride;
+ end = start + region.size;
+
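+ /*
+ * The first region also covers the unaligned bytes at the start of the
+ * buffer, and the last region extends to region.end to pick up any
+ * leftover pages (see tcg_region_init()).
+ */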
+ if (curr_region == 0) {
+ start = region.start;
+ }
+ if (curr_region == region.n - 1) {
+ end = region.end;
+ }
+
+ *pstart = start;
+ *pend = end;
+}
+
+static void tcg_region_assign(TCGContext *s, size_t curr_region)
+{
+ void *start, *end;
+
+ tcg_region_bounds(curr_region, &start, &end);
+
+ s->code_gen_buffer = start;
+ s->code_gen_ptr = start;
+ s->code_gen_buffer_size = end - start;
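+ /* leave TCG_HIGHWATER bytes of slack before the end of the region */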
+ s->code_gen_highwater = end - TCG_HIGHWATER;
+}
+
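+/* Returns true on error, i.e. when all regions are already in use */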
+static bool tcg_region_alloc__locked(TCGContext *s)
+{
+ if (region.current == region.n) {
+ return true;
+ }
+ tcg_region_assign(s, region.current);
+ region.current++;
+ return false;
+}
+
+/*
+ * Request a new region once the one in use has filled up.
+ * Returns true on error.
+ */
+static bool tcg_region_alloc(TCGContext *s)
+{
+ bool err;
+ /* read the region size now; alloc__locked will overwrite it on success */
+ size_t size_full = s->code_gen_buffer_size;
+
+ qemu_mutex_lock(&region.lock);
+ err = tcg_region_alloc__locked(s);
+ if (!err) {
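+ /* count the region we just filled, up to its highwater mark */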
+ region.agg_size_full += size_full - TCG_HIGHWATER;
+ }
+ qemu_mutex_unlock(&region.lock);
+ return err;
+}
+
+/*
+ * Perform a context's first region allocation.
+ * This function does _not_ increment region.agg_size_full.
+ */
+static inline bool tcg_region_initial_alloc__locked(TCGContext *s)
+{
+ return tcg_region_alloc__locked(s);
+}
+
+/* Call from a safe-work context */
+void tcg_region_reset_all(void)
+{
+ unsigned int i;
+
+ qemu_mutex_lock(&region.lock);
+ region.current = 0;
+ region.agg_size_full = 0;
+
+ for (i = 0; i < n_tcg_ctxs; i++) {
+ bool err = tcg_region_initial_alloc__locked(tcg_ctxs[i]);
+
+ g_assert(!err);
+ }
+ qemu_mutex_unlock(&region.lock);
+}
+
+/*
+ * Initializes region partitioning.
+ *
+ * Called at init time from the parent thread (i.e. the one calling
+ * tcg_context_init), after the target's TCG globals have been set.
+ */
+void tcg_region_init(void)
+{
+ void *buf = tcg_init_ctx.code_gen_buffer;
+ void *aligned;
+ size_t size = tcg_init_ctx.code_gen_buffer_size;
+ size_t page_size = qemu_real_host_page_size;
+ size_t region_size;
+ size_t n_regions;
+ size_t i;
+
+ /* We do not yet support multiple TCG contexts, so use one region for now */
+ n_regions = 1;
+
+ /* The first region will be 'aligned - buf' bytes larger than the others */
+ aligned = QEMU_ALIGN_PTR_UP(buf, page_size);
+ g_assert(aligned < tcg_init_ctx.code_gen_buffer + size);
+ /*
+ * Make region_size a multiple of page_size, using aligned as the start.
+ * As a result of this we might end up with a few extra pages at the end of
+ * the buffer; we will assign those to the last region.
+ */
+ region_size = (size - (aligned - buf)) / n_regions;
+ region_size = QEMU_ALIGN_DOWN(region_size, page_size);
+
+ /* A region must have at least 2 pages; one code, one guard */
+ g_assert(region_size >= 2 * page_size);
+
+ /* init the region struct */
+ qemu_mutex_init(&region.lock);
+ region.n = n_regions;
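+ /* exclude each region's guard page from its usable size */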
+ region.size = region_size - page_size;
+ region.stride = region_size;
+ region.start = buf;
+ region.start_aligned = aligned;
+ /* page-align the end, since its last page will be a guard page */
+ region.end = QEMU_ALIGN_PTR_DOWN(buf + size, page_size);
+ /* account for that last guard page */
+ region.end -= page_size;
+
+ /* set guard pages */
+ for (i = 0; i < region.n; i++) {
+ void *start, *end;
+ int rc;
+
+ tcg_region_bounds(i, &start, &end);
+ rc = qemu_mprotect_none(end, page_size);
+ g_assert(!rc);
+ }
+
+ /* We do not yet support multiple TCG contexts, so allocate the region now */
+ {
+ bool err = tcg_region_initial_alloc__locked(tcg_ctx);
+
+ g_assert(!err);
+ }
+}
+
+/*
+ * Returns the size (in bytes) of all translated code (i.e. from all regions)
+ * currently in the cache.
+ * See also: tcg_code_capacity()
+ * Do not confuse with tcg_current_code_size(); that one applies to a single
+ * TCG context.
+ */
+size_t tcg_code_size(void)
+{
+ unsigned int i;
+ size_t total;
+
+ qemu_mutex_lock(&region.lock);
+ total = region.agg_size_full;
+ for (i = 0; i < n_tcg_ctxs; i++) {
+ const TCGContext *s = tcg_ctxs[i];
+ size_t size;
+
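+ /*
+ * Pairs with the atomic_set() of code_gen_ptr in tcg_tb_alloc() and
+ * tb_gen_code().
+ */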
+ size = atomic_read(&s->code_gen_ptr) - s->code_gen_buffer;
+ g_assert(size <= s->code_gen_buffer_size);
+ total += size;
+ }
+ qemu_mutex_unlock(&region.lock);
+ return total;
+}
+
+/*
+ * Returns the code capacity (in bytes) of the entire cache, i.e. including all
+ * regions.
+ * See also: tcg_code_size()
+ */
+size_t tcg_code_capacity(void)
+{
+ size_t guard_size, capacity;
+
+ /* no need for synchronization; these variables are set at init time */
+ guard_size = region.stride - region.size;
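+ /*
+ * region.end excludes the last guard page; add guard_size back to get the
+ * full span, then subtract each region's guard page and highwater slack.
+ */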
+ capacity = region.end + guard_size - region.start;
+ capacity -= region.n * (guard_size + TCG_HIGHWATER);
+ return capacity;
+}
+
/* pool based memory allocation */
void *tcg_malloc_internal(TCGContext *s, int size)
{
@@ -404,13 +618,17 @@ TranslationBlock *tcg_tb_alloc(TCGContext *s)
TranslationBlock *tb;
void *next;
+ retry:
tb = (void *)ROUND_UP((uintptr_t)s->code_gen_ptr, align);
next = (void *)ROUND_UP((uintptr_t)(tb + 1), align);
if (unlikely(next > s->code_gen_highwater)) {
- return NULL;
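+ /* the current region is full: try to switch to a new one */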
+ if (tcg_region_alloc(s)) {
+ return NULL;
+ }
+ goto retry;
}
- s->code_gen_ptr = next;
+ atomic_set(&s->code_gen_ptr, next);
s->data_gen_ptr = NULL;
return tb;
}