| Message ID | 20241114160131.48616-32-richard.henderson@linaro.org |
|---|---|
| State | New |
| Series | accel/tcg: Convert victim tlb to IntervalTree |
On 11/14/24 08:01, Richard Henderson wrote:
> Because translation is special, we don't need the speed
> of the direct-mapped softmmu tlb.  We cache lookups in
> DisasContextBase within the translator loop anyway.
>
> Drop the addr_code comparator from CPUTLBEntry.
> Go directly to the IntervalTree for MMU_INST_FETCH.
> Derive exec flags from read flags.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
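The rework in the diff below splits tlb_lookup: data accesses still probe the direct-mapped fast TLB first and fall back to the IntervalTree, while instruction fetches skip the fast TLB, require PAGE_EXEC on the tree node, and take their flags from the read comparator. The following is a minimal standalone sketch of that split; the tlb_node type, the lookup_insn helper, and the linear scan standing in for the IntervalTree are invented for illustration and are not QEMU's actual API.

/*
 * Toy model of the lookup split, not QEMU code: tlb_node and lookup_insn
 * are invented stand-ins, and a linear scan replaces the IntervalTree.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE        4096ull
#define PAGE_MASK        (~(PAGE_SIZE - 1))
#define FLAG_MMIO        0x1u        /* stand-in for TLB_MMIO */
#define EXEC_FLAGS_MASK  FLAG_MMIO   /* stand-in for TLB_EXEC_FLAGS_MASK */
#define PAGE_EXEC        0x4         /* stand-in for QEMU's PAGE_EXEC */

/* One cached translation, as a tree node would carry it. */
typedef struct {
    uint64_t vaddr_page;     /* guest virtual page address */
    uint64_t addr_read;      /* read comparator; low bits hold flags */
    int      prot;           /* PAGE_* permissions */
    intptr_t addend;         /* host address = guest address + addend */
} tlb_node;

/* Stand-in for tlbtree_lookup_addr(): linear scan instead of a tree. */
static tlb_node *tree_lookup(tlb_node *nodes, size_t n, uint64_t addr)
{
    for (size_t i = 0; i < n; i++) {
        if ((addr & PAGE_MASK) == nodes[i].vaddr_page) {
            return &nodes[i];
        }
    }
    return NULL;
}

/*
 * Instruction fetch: bypass the direct-mapped fast TLB entirely,
 * require execute permission, and filter read flags down to exec flags.
 */
static bool lookup_insn(tlb_node *nodes, size_t n, uint64_t addr,
                        unsigned *flags, void **haddr)
{
    tlb_node *node = tree_lookup(nodes, n, addr);

    if (!node || !(node->prot & PAGE_EXEC)) {
        return false;    /* would fall through to the tlb fill path */
    }
    *flags = node->addr_read & EXEC_FLAGS_MASK;
    *haddr = (void *)(uintptr_t)(addr + node->addend);
    return true;
}

int main(void)
{
    static char backing[2 * PAGE_SIZE];
    tlb_node nodes[] = {{
        .vaddr_page = 0x401000,
        .addr_read  = 0x401000,                   /* no MMIO flag set */
        .prot       = PAGE_EXEC,
        .addend     = (intptr_t)backing - 0x401000,
    }};
    unsigned flags;
    void *haddr;

    if (lookup_insn(nodes, 1, 0x401010, &flags, &haddr)) {
        printf("fetch hit: flags=%#x haddr=%p\n", flags, haddr);
    }
    return 0;
}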
diff --git a/include/exec/cpu-all.h b/include/exec/cpu-all.h
index 45e6676938..ad160c328a 100644
--- a/include/exec/cpu-all.h
+++ b/include/exec/cpu-all.h
@@ -339,6 +339,9 @@ static inline int cpu_mmu_index(CPUState *cs, bool ifetch)
     (TLB_INVALID_MASK | TLB_NOTDIRTY | TLB_MMIO \
      | TLB_FORCE_SLOW | TLB_DISCARD_WRITE)
 
+/* Filter read flags to exec flags. */
+#define TLB_EXEC_FLAGS_MASK (TLB_MMIO)
+
 /*
  * Flags stored in CPUTLBEntryFull.slow_flags[x].
  * TLB_FORCE_SLOW must be set in CPUTLBEntry.addr_idx[x].
diff --git a/include/exec/tlb-common.h b/include/exec/tlb-common.h
index 300f9fae67..feaa471299 100644
--- a/include/exec/tlb-common.h
+++ b/include/exec/tlb-common.h
@@ -26,7 +26,6 @@ typedef union CPUTLBEntry {
     struct {
         uint64_t addr_read;
         uint64_t addr_write;
-        uint64_t addr_code;
         /*
          * Addend to virtual address to get host address.  IO accesses
          * use the corresponding iotlb value.
@@ -35,7 +34,7 @@ typedef union CPUTLBEntry {
     };
     /*
      * Padding to get a power of two size, as well as index
-     * access to addr_{read,write,code}.
+     * access to addr_{read,write}.
      */
     uint64_t addr_idx[(1 << CPU_TLB_ENTRY_BITS) / sizeof(uint64_t)];
 } CPUTLBEntry;
@@ -92,7 +91,7 @@ struct CPUTLBEntryFull {
      * Additional tlb flags for use by the slow path. If non-zero,
      * the corresponding CPUTLBEntry comparator must have TLB_FORCE_SLOW.
      */
-    uint8_t slow_flags[MMU_ACCESS_COUNT];
+    uint8_t slow_flags[2];
 
     /*
      * Allow target-specific additions to this structure.
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index 981098a6f2..be2ea1bc70 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -114,8 +114,9 @@ static inline uint64_t tlb_read_idx(const CPUTLBEntry *entry,
                       MMU_DATA_LOAD * sizeof(uint64_t));
     QEMU_BUILD_BUG_ON(offsetof(CPUTLBEntry, addr_write) !=
                       MMU_DATA_STORE * sizeof(uint64_t));
-    QEMU_BUILD_BUG_ON(offsetof(CPUTLBEntry, addr_code) !=
-                      MMU_INST_FETCH * sizeof(uint64_t));
+
+    tcg_debug_assert(access_type == MMU_DATA_LOAD ||
+                     access_type == MMU_DATA_STORE);
 
 #if TARGET_LONG_BITS == 32
     /* Use qatomic_read, in case of addr_write; only care about low bits. */
@@ -480,8 +481,7 @@ static bool tlb_hit_page_mask_anyprot(CPUTLBEntry *tlb_entry,
     mask &= TARGET_PAGE_MASK | TLB_INVALID_MASK;
 
     return (page == (tlb_entry->addr_read & mask) ||
-            page == (tlb_addr_write(tlb_entry) & mask) ||
-            page == (tlb_entry->addr_code & mask));
+            page == (tlb_addr_write(tlb_entry) & mask));
 }
 
 /* Called with tlb_c.lock held */
@@ -1184,9 +1184,6 @@ void tlb_set_page_full(CPUState *cpu, int mmu_idx,
     /* Now calculate the new entry */
     node->copy.addend = addend - addr_page;
 
-    tlb_set_compare(full, &node->copy, addr_page, read_flags,
-                    MMU_INST_FETCH, prot & PAGE_EXEC);
-
     if (wp_flags & BP_MEM_READ) {
         read_flags |= TLB_WATCHPOINT;
     }
@@ -1308,22 +1305,30 @@ static bool tlb_lookup(CPUState *cpu, TLBLookupOutput *o,
     /* Primary lookup in the fast tlb. */
     entry = tlbfast_entry(fast, addr);
     full = &desc->fulltlb[tlbfast_index(fast, addr)];
-    cmp = tlb_read_idx(entry, access_type);
-    if (tlb_hit(cmp, addr)) {
-        goto found;
+    if (access_type != MMU_INST_FETCH) {
+        cmp = tlb_read_idx(entry, access_type);
+        if (tlb_hit(cmp, addr)) {
+            goto found_data;
+        }
     }
 
     /* Secondary lookup in the IntervalTree. */
     node = tlbtree_lookup_addr(desc, addr);
     if (node) {
-        cmp = tlb_read_idx(&node->copy, access_type);
-        if (tlb_hit(cmp, addr)) {
-            /* Install the cached entry. */
-            qemu_spin_lock(&cpu->neg.tlb.c.lock);
-            copy_tlb_helper_locked(entry, &node->copy);
-            qemu_spin_unlock(&cpu->neg.tlb.c.lock);
-            *full = node->full;
-            goto found;
+        if (access_type == MMU_INST_FETCH) {
+            if (node->full.prot & PAGE_EXEC) {
+                goto found_code;
+            }
+        } else {
+            cmp = tlb_read_idx(&node->copy, access_type);
+            if (tlb_hit(cmp, addr)) {
+                /* Install the cached entry. */
+                qemu_spin_lock(&cpu->neg.tlb.c.lock);
+                copy_tlb_helper_locked(entry, &node->copy);
+                qemu_spin_unlock(&cpu->neg.tlb.c.lock);
+                *full = node->full;
+                goto found_data;
+            }
         }
     }
 
@@ -1333,9 +1338,14 @@ static bool tlb_lookup(CPUState *cpu, TLBLookupOutput *o,
         tcg_debug_assert(probe);
         return false;
     }
-
     o->did_tlb_fill = true;
 
+    if (access_type == MMU_INST_FETCH) {
+        node = tlbtree_lookup_addr(desc, addr);
+        tcg_debug_assert(node);
+        goto found_code;
+    }
+
     entry = tlbfast_entry(fast, addr);
     full = &desc->fulltlb[tlbfast_index(fast, addr)];
     cmp = tlb_read_idx(entry, access_type);
@@ -1345,14 +1355,29 @@ static bool tlb_lookup(CPUState *cpu, TLBLookupOutput *o,
      * called tlb_fill_align, so we know that this entry *is* valid.
      */
     flags &= ~TLB_INVALID_MASK;
+    goto found_data;
+
+ found_data:
+    flags &= cmp;
+    flags |= full->slow_flags[access_type];
+    o->flags = flags;
+    o->full = *full;
+    o->haddr = (void *)((uintptr_t)addr + entry->addend);
     goto done;
 
- found:
-    /* Alignment has not been checked by tlb_fill_align. */
-    {
+ found_code:
+    o->flags = node->copy.addr_read & TLB_EXEC_FLAGS_MASK;
+    o->full = node->full;
+    o->haddr = (void *)((uintptr_t)addr + node->copy.addend);
+    goto done;
+
+ done:
+    if (!o->did_tlb_fill) {
         int a_bits = memop_alignment_bits(memop);
 
         /*
+         * Alignment has not been checked by tlb_fill_align.
+         *
          * The TLB_CHECK_ALIGNED check differs from the normal alignment
          * check, in that this is based on the atomicity of the operation.
         * The intended use case is the ARM memory type field of each PTE,
@@ -1366,13 +1391,6 @@ static bool tlb_lookup(CPUState *cpu, TLBLookupOutput *o,
             cpu_unaligned_access(cpu, addr, access_type, i->mmu_idx, i->ra);
         }
     }
-
- done:
-    flags &= cmp;
-    flags |= full->slow_flags[access_type];
-    o->flags = flags;
-    o->full = *full;
-    o->haddr = (void *)((uintptr_t)addr + entry->addend);
 
     return true;
 }
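One consequence visible in the tlb-common.h hunk above: slow_flags shrinks from MMU_ACCESS_COUNT entries to two, because once fetches bypass the fast TLB only MMU_DATA_LOAD (0) and MMU_DATA_STORE (1) ever index it, and the new tcg_debug_assert in tlb_read_idx records the same invariant. A small sketch of that indexing assumption follows; the enum mirrors QEMU's MMUAccessType ordering (as the QEMU_BUILD_BUG_ONs above imply), while read_slow_flags is an invented helper, not QEMU code.

/*
 * Illustration only: the enum ordering mirrors QEMU's MMUAccessType,
 * but read_slow_flags() is an invented helper.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef enum {
    MMU_DATA_LOAD  = 0,   /* indexes addr_read / slow_flags[0] */
    MMU_DATA_STORE = 1,   /* indexes addr_write / slow_flags[1] */
    MMU_INST_FETCH = 2,   /* no longer used as an array index */
} MMUAccessType;

/* Two slots are enough once fetches never reach this array. */
static uint8_t slow_flags[2] = { 0x10, 0x20 };

static uint8_t read_slow_flags(MMUAccessType access_type)
{
    /* Same invariant as the new tcg_debug_assert in tlb_read_idx(). */
    assert(access_type == MMU_DATA_LOAD || access_type == MMU_DATA_STORE);
    return slow_flags[access_type];
}

int main(void)
{
    printf("load flags %#x, store flags %#x\n",
           read_slow_flags(MMU_DATA_LOAD), read_slow_flags(MMU_DATA_STORE));
    return 0;
}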
Because translation is special, we don't need the speed
of the direct-mapped softmmu tlb.  We cache lookups in
DisasContextBase within the translator loop anyway.

Drop the addr_code comparator from CPUTLBEntry.
Go directly to the IntervalTree for MMU_INST_FETCH.
Derive exec flags from read flags.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/exec/cpu-all.h    |  3 ++
 include/exec/tlb-common.h |  5 ++-
 accel/tcg/cputlb.c        | 76 ++++++++++++++++++++++++---------------
 3 files changed, 52 insertions(+), 32 deletions(-)
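"Derive exec flags from read flags" boils down to the new TLB_EXEC_FLAGS_MASK filter added in cpu-all.h: of the flags that can live in the read comparator, only TLB_MMIO is meaningful for a fetch. A standalone illustration follows, with bit positions invented for the sketch (QEMU derives the real TLB_* bits from the target page size).

/* Bit values invented for illustration; only the masking logic matters. */
#include <assert.h>
#include <stdint.h>

enum {
    TLB_MMIO          = 1u << 0,   /* page requires I/O access */
    TLB_NOTDIRTY      = 1u << 1,   /* write-related, irrelevant to fetch */
    TLB_DISCARD_WRITE = 1u << 2,   /* write-related, irrelevant to fetch */
};

/* As in the cpu-all.h hunk: filter read flags to exec flags. */
#define TLB_EXEC_FLAGS_MASK (TLB_MMIO)

int main(void)
{
    uint32_t read_flags = TLB_MMIO | TLB_NOTDIRTY | TLB_DISCARD_WRITE;

    /* The found_code path keeps only the bits that matter for a fetch. */
    uint32_t exec_flags = read_flags & TLB_EXEC_FLAGS_MASK;

    assert(exec_flags == TLB_MMIO);
    return 0;
}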