Message ID | 20201030004921.721096-19-richard.henderson@linaro.org |
---|---|
State | New |
Headers | show |
Series | Mirror map JIT memory for TCG | expand |
Unfortunately this crashes on iOS/Apple Silicon macOS. (lldb) bt * thread #19, stop reason = EXC_BAD_INSTRUCTION (code=1, subcode=0xd53b002a) * frame #0: 0x00000001169501e0 libqemu-x86_64-softmmu.utm.dylib`tcg_prologue_init + 760 ... (lldb) x/i 0x00000001169501e0 -> 0x1169501e0: 0xd53b002a mrs x10, CTR_EL0 I was able to fix it by adding #ifdef CONFIG_DARWIN extern void sys_icache_invalidate(void *start, size_t len); extern void sys_dcache_flush(void *start, size_t len); void flush_idcache_range(uintptr_t rx, uintptr_t rw, size_t len) { sys_dcache_flush((void *)rw, len); sys_icache_invalidate((void *)rx, len); } #else ... #endif Another thing, for x86 (and maybe other archs), the icache is cache coherent but does it apply if we are aliasing the memory address? I think in that case, it's like we're doing a DMA right and still need to do flushing+invalidating? -j On Thu, Oct 29, 2020 at 5:49 PM Richard Henderson <richard.henderson@linaro.org> wrote: > > Copy the single pointer implementation from libgcc and modify it to > support the double pointer interface we require. This halves the > number of cache operations required when split-rwx is enabled. > > Signed-off-by: Richard Henderson <richard.henderson@linaro.org> > --- > tcg/aarch64/tcg-target.h | 11 +------- > tcg/aarch64/tcg-target.c.inc | 53 ++++++++++++++++++++++++++++++++++++ > 2 files changed, 54 insertions(+), 10 deletions(-) > > diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h > index fa64058d43..e62d38ba55 100644 > --- a/tcg/aarch64/tcg-target.h > +++ b/tcg/aarch64/tcg-target.h > @@ -148,16 +148,7 @@ typedef enum { > #define TCG_TARGET_DEFAULT_MO (0) > #define TCG_TARGET_HAS_MEMORY_BSWAP 1 > > -/* Flush the dcache at RW, and the icache at RX, as necessary. */ > -static inline void flush_idcache_range(uintptr_t rx, uintptr_t rw, size_t len) > -{ > - /* TODO: Copy this from gcc to avoid 4 loops instead of 2.
 */ > - if (rw != rx) { > - __builtin___clear_cache((char *)rw, (char *)(rw + len)); > - } > - __builtin___clear_cache((char *)rx, (char *)(rx + len)); > -} > - > +void flush_idcache_range(uintptr_t rx, uintptr_t rw, size_t len); > void tb_target_set_jmp_target(uintptr_t, uintptr_t, uintptr_t, uintptr_t); > > #ifdef CONFIG_SOFTMMU > diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc > index bd888bc66d..5e8f3faad2 100644 > --- a/tcg/aarch64/tcg-target.c.inc > +++ b/tcg/aarch64/tcg-target.c.inc > @@ -2968,3 +2968,56 @@ void tcg_register_jit(const void *buf, size_t buf_size) > { > tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame)); > } > + > +/* > + * Flush the dcache at RW, and the icache at RX, as necessary. > + * This is a copy of gcc's __aarch64_sync_cache_range, modified > + * to fit this three-operand interface. > + */ > +void flush_idcache_range(uintptr_t rx, uintptr_t rw, size_t len) > +{ > + const unsigned CTR_IDC = 1u << 28; > + const unsigned CTR_DIC = 1u << 29; > + static unsigned int cache_info; > + uintptr_t icache_lsize, dcache_lsize, p; > + > + if (!cache_info) { > + /* > + * CTR_EL0 [3:0] contains log2 of icache line size in words. > + * CTR_EL0 [19:16] contains log2 of dcache line size in words. > + */ > + asm volatile("mrs\t%0, ctr_el0" : "=r"(cache_info)); > + } > + > + icache_lsize = 4 << extract32(cache_info, 0, 4); > + dcache_lsize = 4 << extract32(cache_info, 16, 4); > + > + /* > + * If CTR_EL0.IDC is enabled, Data cache clean to the Point of Unification > + * is not required for instruction to data coherence. > + */ > + if (!(cache_info & CTR_IDC)) { > + /* > + * Loop over the address range, clearing one cache line at once. > + * Data cache must be flushed to unification first to make sure > + * the instruction cache fetches the updated data.
> + */ > + for (p = rw & -dcache_lsize; p < rw + len; p += dcache_lsize) { > + asm volatile("dc\tcvau, %0" : : "r" (p) : "memory"); > + } > + asm volatile("dsb\tish" : : : "memory"); > + } > + > + /* > + * If CTR_EL0.DIC is enabled, Instruction cache cleaning to the Point > + * of Unification is not required for instruction to data coherence. > + */ > + if (!(cache_info & CTR_DIC)) { > + for (p = rx & -icache_lsize; p < rx + len; p += icache_lsize) { > + asm volatile("ic\tivau, %0" : : "r"(p) : "memory"); > + } > + asm volatile ("dsb\tish" : : : "memory"); > + } > + > + asm volatile("isb" : : : "memory"); > +} > -- > 2.25.1 >
On 10/31/20 6:25 PM, Joelle van Dyne wrote: > Another thing, for x86 (and maybe other archs), the icache is cache > coherent but does it apply if we are aliasing the memory address? I > think in that case, it's like we're doing a DMA right and still need > to do flushing+invalidating? No, it is not like dma. The x86 caches are physically tagged, so virtual aliasing does not matter. r~
On 10/31/20 6:25 PM, Joelle van Dyne wrote: > Unfortunately this crashes on iOS/Apple Silicon macOS. > > (lldb) bt > * thread #19, stop reason = EXC_BAD_INSTRUCTION (code=1, subcode=0xd53b002a) > * frame #0: 0x00000001169501e0 > libqemu-x86_64-softmmu.utm.dylib`tcg_prologue_init + 760 > ... > (lldb) x/i 0x00000001169501e0 > -> 0x1169501e0: 0xd53b002a mrs x10, CTR_EL0 That is *really* annoying. Why in the world would Apple not set SCTLR_ELx.UCT? There's nothing that the OS can do better than the application for ARMv8.0-A. Oh well. I'll paste your code in. r~
diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h index fa64058d43..e62d38ba55 100644 --- a/tcg/aarch64/tcg-target.h +++ b/tcg/aarch64/tcg-target.h @@ -148,16 +148,7 @@ typedef enum { #define TCG_TARGET_DEFAULT_MO (0) #define TCG_TARGET_HAS_MEMORY_BSWAP 1 -/* Flush the dcache at RW, and the icache at RX, as necessary. */ -static inline void flush_idcache_range(uintptr_t rx, uintptr_t rw, size_t len) -{ - /* TODO: Copy this from gcc to avoid 4 loops instead of 2. */ - if (rw != rx) { - __builtin___clear_cache((char *)rw, (char *)(rw + len)); - } - __builtin___clear_cache((char *)rx, (char *)(rx + len)); -} - +void flush_idcache_range(uintptr_t rx, uintptr_t rw, size_t len); void tb_target_set_jmp_target(uintptr_t, uintptr_t, uintptr_t, uintptr_t); #ifdef CONFIG_SOFTMMU diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc index bd888bc66d..5e8f3faad2 100644 --- a/tcg/aarch64/tcg-target.c.inc +++ b/tcg/aarch64/tcg-target.c.inc @@ -2968,3 +2968,56 @@ void tcg_register_jit(const void *buf, size_t buf_size) { tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame)); } + +/* + * Flush the dcache at RW, and the icache at RX, as necessary. + * This is a copy of gcc's __aarch64_sync_cache_range, modified + * to fit this three-operand interface. + */ +void flush_idcache_range(uintptr_t rx, uintptr_t rw, size_t len) +{ + const unsigned CTR_IDC = 1u << 28; + const unsigned CTR_DIC = 1u << 29; + static unsigned int cache_info; + uintptr_t icache_lsize, dcache_lsize, p; + + if (!cache_info) { + /* + * CTR_EL0 [3:0] contains log2 of icache line size in words. + * CTR_EL0 [19:16] contains log2 of dcache line size in words. + */ + asm volatile("mrs\t%0, ctr_el0" : "=r"(cache_info)); + } + + icache_lsize = 4 << extract32(cache_info, 0, 4); + dcache_lsize = 4 << extract32(cache_info, 16, 4); + + /* + * If CTR_EL0.IDC is enabled, Data cache clean to the Point of Unification + * is not required for instruction to data coherence.
+ */ + if (!(cache_info & CTR_IDC)) { + /* + * Loop over the address range, clearing one cache line at once. + * Data cache must be flushed to unification first to make sure + * the instruction cache fetches the updated data. + */ + for (p = rw & -dcache_lsize; p < rw + len; p += dcache_lsize) { + asm volatile("dc\tcvau, %0" : : "r" (p) : "memory"); + } + asm volatile("dsb\tish" : : : "memory"); + } + + /* + * If CTR_EL0.DIC is enabled, Instruction cache cleaning to the Point + * of Unification is not required for instruction to data coherence. + */ + if (!(cache_info & CTR_DIC)) { + for (p = rx & -icache_lsize; p < rx + len; p += icache_lsize) { + asm volatile("ic\tivau, %0" : : "r"(p) : "memory"); + } + asm volatile ("dsb\tish" : : : "memory"); + } + + asm volatile("isb" : : : "memory"); +}
Copy the single pointer implementation from libgcc and modify it to support the double pointer interface we require. This halves the number of cache operations required when split-rwx is enabled. Signed-off-by: Richard Henderson <richard.henderson@linaro.org> --- tcg/aarch64/tcg-target.h | 11 +------- tcg/aarch64/tcg-target.c.inc | 53 ++++++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 10 deletions(-)