Message ID | 20220804092044.2101093-1-alex.bennee@linaro.org |
---|---|
State | New |
Headers | show |
Series | [RFC] cputlb and ssi: cache class to avoid expensive object_dynamic_cast_assert (HACK!!!) | expand |
Hello Alex, Thanks for putting some time into this problem, On 8/4/22 11:20, Alex Bennée wrote: > Investigating why some BMC models are so slow compared to a plain ARM > virt machines I did some profiling of: > > ./qemu-system-arm -M romulus-bmc -nic user \ > -drive > file=obmc-phosphor-image-romulus.static.mtd,format=raw,if=mtd \ > -nographic -serial mon:stdio > > And saw that object_dynamic_cast was dominating the profile times. We > have a number of cases in the CPU hot path and more importantly for > this model in the SSI bus. As the class is static once the object is > created we just cache it and use it instead of the dynamic case > macros. > > [AJB: I suspect a proper fix for this is for QOM to support a cached > class lookup, abortive macro attempt #if 0'd in this patch]. > > Signed-off-by: Alex Bennée <alex.bennee@linaro.org> > Cc: Cédric Le Goater <clg@kaod.org> Here are some results, * romulus-bmc, OpenBmc login prompt without : 82s with : 56s * ast2500-evb,execute-in-place=true, U-boot 2019.04 prompt without : 30s with : 22s * witherspoon-bmc,execute-in-place=true, U-boot 2016.07 prompt without : 5.5s with : 4.1s There is definitely an improvement in all scenarios. Applying a similar technique on AspeedSMCClass, I could gain another ~10% and boot the ast2500-evb,execute-in-place=true machine, up to the U-boot 2019.04 prompt, in less then 20s. However, newer u-boot are still quite slow to boot when executing from the flash device. Thanks, C. > --- > include/hw/core/cpu.h | 2 ++ > include/hw/ssi/ssi.h | 3 +++ > include/qom/object.h | 29 +++++++++++++++++++++++++++++ > accel/tcg/cputlb.c | 12 ++++++++---- > hw/ssi/ssi.c | 10 +++++++--- > 5 files changed, 49 insertions(+), 7 deletions(-) > > diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h > index 500503da13..70027a772e 100644 > --- a/include/hw/core/cpu.h > +++ b/include/hw/core/cpu.h > @@ -317,6 +317,8 @@ struct qemu_work_item; > struct CPUState { > /*< private >*/ > DeviceState parent_obj; > + /* cache to avoid expensive CPU_GET_CLASS */ > + CPUClass *cc; > /*< public >*/ > > int nr_cores; > diff --git a/include/hw/ssi/ssi.h b/include/hw/ssi/ssi.h > index f411858ab0..6950f86810 100644 > --- a/include/hw/ssi/ssi.h > +++ b/include/hw/ssi/ssi.h > @@ -59,6 +59,9 @@ struct SSIPeripheralClass { > struct SSIPeripheral { > DeviceState parent_obj; > > + /* cache the class */ > + SSIPeripheralClass *spc; > + > /* Chip select state */ > bool cs; > }; > diff --git a/include/qom/object.h b/include/qom/object.h > index ef7258a5e1..2202dbfa43 100644 > --- a/include/qom/object.h > +++ b/include/qom/object.h > @@ -198,6 +198,35 @@ struct Object > OBJ_NAME##_CLASS(const void *klass) \ > { return OBJECT_CLASS_CHECK(ClassType, klass, TYPENAME); } > > +#if 0 > +/** > + * DECLARE_CACHED_CLASS_CHECKER: > + * @InstanceType: instance struct name > + * @ClassType: class struct name > + * @OBJ_NAME: the object name in uppercase with underscore separators > + * @TYPENAME: type name > + * > + * This variant of DECLARE_CLASS_CHECKERS allows for the caching of > + * class in the parent object instance. This is useful for very hot > + * path code at the expense of an extra indirection and check. As per > + * the original direct usage of this macro should be avoided if the > + * complete OBJECT_DECLARE_TYPE macro has been used. > + * > + * This macro will provide the class type cast functions for a > + * QOM type. > + */ > +#define DECLARE_CACHED_CLASS_CHECKERS(InstanceType, ClassType, OBJ_NAME, TYPENAME) \ > + DECLARE_CLASS_CHECKERS(ClassType, OBJ_NAME, TYPENAME) \ > + static inline G_GNUC_UNUSED ClassType * \ > + OBJ_NAME##_GET_CACHED_CLASS(const void *obj) \ > + { \ > + InstanceType *p = (InstanceType *) obj; \ > + p->cc = p->cc ? p->cc : OBJECT_GET_CLASS(ClassType, obj, TYPENAME);\ > + return p->cc; \ > + } > + > +#endif > + > /** > * DECLARE_OBJ_CHECKERS: > * @InstanceType: instance struct name > diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c > index a46f3a654d..882315f7dd 100644 > --- a/accel/tcg/cputlb.c > +++ b/accel/tcg/cputlb.c > @@ -1303,8 +1303,9 @@ static inline ram_addr_t qemu_ram_addr_from_host_nofail(void *ptr) > static void tlb_fill(CPUState *cpu, target_ulong addr, int size, > MMUAccessType access_type, int mmu_idx, uintptr_t retaddr) > { > - CPUClass *cc = CPU_GET_CLASS(cpu); > + CPUClass *cc = cpu->cc ? cpu->cc : CPU_GET_CLASS(cpu); > bool ok; > + cpu->cc = cc; > > /* > * This is not a probe, so only valid return is success; failure > @@ -1319,7 +1320,8 @@ static inline void cpu_unaligned_access(CPUState *cpu, vaddr addr, > MMUAccessType access_type, > int mmu_idx, uintptr_t retaddr) > { > - CPUClass *cc = CPU_GET_CLASS(cpu); > + CPUClass *cc = cpu->cc ? cpu->cc : CPU_GET_CLASS(cpu); > + cpu->cc = cc; > > cc->tcg_ops->do_unaligned_access(cpu, addr, access_type, mmu_idx, retaddr); > } > @@ -1331,7 +1333,8 @@ static inline void cpu_transaction_failed(CPUState *cpu, hwaddr physaddr, > MemTxResult response, > uintptr_t retaddr) > { > - CPUClass *cc = CPU_GET_CLASS(cpu); > + CPUClass *cc = cpu->cc ? cpu->cc : CPU_GET_CLASS(cpu); > + cpu->cc = cc; > > if (!cpu->ignore_memory_transaction_failures && > cc->tcg_ops->do_transaction_failed) { > @@ -1606,7 +1609,8 @@ static int probe_access_internal(CPUArchState *env, target_ulong addr, > if (!tlb_hit_page(tlb_addr, page_addr)) { > if (!victim_tlb_hit(env, mmu_idx, index, elt_ofs, page_addr)) { > CPUState *cs = env_cpu(env); > - CPUClass *cc = CPU_GET_CLASS(cs); > + CPUClass *cc = cs->cc ? cs->cc : CPU_GET_CLASS(cs); > + cs->cc = cc; > > if (!cc->tcg_ops->tlb_fill(cs, addr, fault_size, access_type, > mmu_idx, nonfault, retaddr)) { > diff --git a/hw/ssi/ssi.c b/hw/ssi/ssi.c > index 003931fb50..f749feb6e3 100644 > --- a/hw/ssi/ssi.c > +++ b/hw/ssi/ssi.c > @@ -38,7 +38,8 @@ static void ssi_cs_default(void *opaque, int n, int level) > bool cs = !!level; > assert(n == 0); > if (s->cs != cs) { > - SSIPeripheralClass *ssc = SSI_PERIPHERAL_GET_CLASS(s); > + /* SSIPeripheralClass *ssc = SSI_PERIPHERAL_GET_CLASS(s); */ > + SSIPeripheralClass *ssc = s->spc; > if (ssc->set_cs) { > ssc->set_cs(s, cs); > } > @@ -48,7 +49,8 @@ static void ssi_cs_default(void *opaque, int n, int level) > > static uint32_t ssi_transfer_raw_default(SSIPeripheral *dev, uint32_t val) > { > - SSIPeripheralClass *ssc = SSI_PERIPHERAL_GET_CLASS(dev); > + /* SSIPeripheralClass *ssc = SSI_PERIPHERAL_GET_CLASS(dev); */ > + SSIPeripheralClass *ssc = dev->spc; > > if ((dev->cs && ssc->cs_polarity == SSI_CS_HIGH) || > (!dev->cs && ssc->cs_polarity == SSI_CS_LOW) || > @@ -67,6 +69,7 @@ static void ssi_peripheral_realize(DeviceState *dev, Error **errp) > ssc->cs_polarity != SSI_CS_NONE) { > qdev_init_gpio_in_named(dev, ssi_cs_default, SSI_GPIO_CS, 1); > } > + s->spc = ssc; > > ssc->realize(s, errp); > } > @@ -120,7 +123,8 @@ uint32_t ssi_transfer(SSIBus *bus, uint32_t val) > > QTAILQ_FOREACH(kid, &b->children, sibling) { > SSIPeripheral *peripheral = SSI_PERIPHERAL(kid->child); > - ssc = SSI_PERIPHERAL_GET_CLASS(peripheral); > + /* ssc = SSI_PERIPHERAL_GET_CLASS(peripheral); */ > + ssc = peripheral->spc; > r |= ssc->transfer_raw(peripheral, val); > } >
Cédric Le Goater <clg@kaod.org> writes: > Hello Alex, > > Thanks for putting some time into this problem, > > On 8/4/22 11:20, Alex Bennée wrote: >> Investigating why some BMC models are so slow compared to a plain ARM >> virt machines I did some profiling of: >> ./qemu-system-arm -M romulus-bmc -nic user \ >> -drive >> file=obmc-phosphor-image-romulus.static.mtd,format=raw,if=mtd \ >> -nographic -serial mon:stdio >> And saw that object_dynamic_cast was dominating the profile times. >> We >> have a number of cases in the CPU hot path and more importantly for >> this model in the SSI bus. As the class is static once the object is >> created we just cache it and use it instead of the dynamic case >> macros. >> [AJB: I suspect a proper fix for this is for QOM to support a cached >> class lookup, abortive macro attempt #if 0'd in this patch]. >> Signed-off-by: Alex Bennée <alex.bennee@linaro.org> >> Cc: Cédric Le Goater <clg@kaod.org> > > > Here are some results, > > * romulus-bmc, OpenBmc login prompt > > without : 82s > with : 56s Looks like I lucked out picking the lowest hanging fruit. > > * ast2500-evb,execute-in-place=true, U-boot 2019.04 prompt > > without : 30s > with : 22s > > * witherspoon-bmc,execute-in-place=true, U-boot 2016.07 prompt > > without : 5.5s > with : 4.1s > > There is definitely an improvement in all scenarios. > > Applying a similar technique on AspeedSMCClass, I could gain > another ~10% and boot the ast2500-evb,execute-in-place=true > machine, up to the U-boot 2019.04 prompt, in less then 20s. There are some fundamentals to XIP which means they will be slower if each instruction is being sucked through io_readx/device emulation although I'm not sure what the exact mechanism is because surely a ROM can just be mapped into the address space and run from there? > However, newer u-boot are still quite slow to boot when executing > from the flash device. For any of those machines? Whats the next command line for me to dig into? > > Thanks, > > C. > >> --- >> include/hw/core/cpu.h | 2 ++ >> include/hw/ssi/ssi.h | 3 +++ >> include/qom/object.h | 29 +++++++++++++++++++++++++++++ >> accel/tcg/cputlb.c | 12 ++++++++---- >> hw/ssi/ssi.c | 10 +++++++--- >> 5 files changed, 49 insertions(+), 7 deletions(-) >> diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h >> index 500503da13..70027a772e 100644 >> --- a/include/hw/core/cpu.h >> +++ b/include/hw/core/cpu.h >> @@ -317,6 +317,8 @@ struct qemu_work_item; >> struct CPUState { >> /*< private >*/ >> DeviceState parent_obj; >> + /* cache to avoid expensive CPU_GET_CLASS */ >> + CPUClass *cc; >> /*< public >*/ >> int nr_cores; >> diff --git a/include/hw/ssi/ssi.h b/include/hw/ssi/ssi.h >> index f411858ab0..6950f86810 100644 >> --- a/include/hw/ssi/ssi.h >> +++ b/include/hw/ssi/ssi.h >> @@ -59,6 +59,9 @@ struct SSIPeripheralClass { >> struct SSIPeripheral { >> DeviceState parent_obj; >> + /* cache the class */ >> + SSIPeripheralClass *spc; >> + >> /* Chip select state */ >> bool cs; >> }; >> diff --git a/include/qom/object.h b/include/qom/object.h >> index ef7258a5e1..2202dbfa43 100644 >> --- a/include/qom/object.h >> +++ b/include/qom/object.h >> @@ -198,6 +198,35 @@ struct Object >> OBJ_NAME##_CLASS(const void *klass) \ >> { return OBJECT_CLASS_CHECK(ClassType, klass, TYPENAME); } >> +#if 0 >> +/** >> + * DECLARE_CACHED_CLASS_CHECKER: >> + * @InstanceType: instance struct name >> + * @ClassType: class struct name >> + * @OBJ_NAME: the object name in uppercase with underscore separators >> + * @TYPENAME: type name >> + * >> + * This variant of DECLARE_CLASS_CHECKERS allows for the caching of >> + * class in the parent object instance. This is useful for very hot >> + * path code at the expense of an extra indirection and check. As per >> + * the original direct usage of this macro should be avoided if the >> + * complete OBJECT_DECLARE_TYPE macro has been used. >> + * >> + * This macro will provide the class type cast functions for a >> + * QOM type. >> + */ >> +#define DECLARE_CACHED_CLASS_CHECKERS(InstanceType, ClassType, OBJ_NAME, TYPENAME) \ >> + DECLARE_CLASS_CHECKERS(ClassType, OBJ_NAME, TYPENAME) \ >> + static inline G_GNUC_UNUSED ClassType * \ >> + OBJ_NAME##_GET_CACHED_CLASS(const void *obj) \ >> + { \ >> + InstanceType *p = (InstanceType *) obj; \ >> + p->cc = p->cc ? p->cc : OBJECT_GET_CLASS(ClassType, obj, TYPENAME);\ >> + return p->cc; \ >> + } >> + >> +#endif >> + >> /** >> * DECLARE_OBJ_CHECKERS: >> * @InstanceType: instance struct name >> diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c >> index a46f3a654d..882315f7dd 100644 >> --- a/accel/tcg/cputlb.c >> +++ b/accel/tcg/cputlb.c >> @@ -1303,8 +1303,9 @@ static inline ram_addr_t qemu_ram_addr_from_host_nofail(void *ptr) >> static void tlb_fill(CPUState *cpu, target_ulong addr, int size, >> MMUAccessType access_type, int mmu_idx, uintptr_t retaddr) >> { >> - CPUClass *cc = CPU_GET_CLASS(cpu); >> + CPUClass *cc = cpu->cc ? cpu->cc : CPU_GET_CLASS(cpu); >> bool ok; >> + cpu->cc = cc; >> /* >> * This is not a probe, so only valid return is success; failure >> @@ -1319,7 +1320,8 @@ static inline void cpu_unaligned_access(CPUState *cpu, vaddr addr, >> MMUAccessType access_type, >> int mmu_idx, uintptr_t retaddr) >> { >> - CPUClass *cc = CPU_GET_CLASS(cpu); >> + CPUClass *cc = cpu->cc ? cpu->cc : CPU_GET_CLASS(cpu); >> + cpu->cc = cc; >> cc->tcg_ops->do_unaligned_access(cpu, addr, access_type, >> mmu_idx, retaddr); >> } >> @@ -1331,7 +1333,8 @@ static inline void cpu_transaction_failed(CPUState *cpu, hwaddr physaddr, >> MemTxResult response, >> uintptr_t retaddr) >> { >> - CPUClass *cc = CPU_GET_CLASS(cpu); >> + CPUClass *cc = cpu->cc ? cpu->cc : CPU_GET_CLASS(cpu); >> + cpu->cc = cc; >> if (!cpu->ignore_memory_transaction_failures && >> cc->tcg_ops->do_transaction_failed) { >> @@ -1606,7 +1609,8 @@ static int probe_access_internal(CPUArchState *env, target_ulong addr, >> if (!tlb_hit_page(tlb_addr, page_addr)) { >> if (!victim_tlb_hit(env, mmu_idx, index, elt_ofs, page_addr)) { >> CPUState *cs = env_cpu(env); >> - CPUClass *cc = CPU_GET_CLASS(cs); >> + CPUClass *cc = cs->cc ? cs->cc : CPU_GET_CLASS(cs); >> + cs->cc = cc; >> if (!cc->tcg_ops->tlb_fill(cs, addr, fault_size, >> access_type, >> mmu_idx, nonfault, retaddr)) { >> diff --git a/hw/ssi/ssi.c b/hw/ssi/ssi.c >> index 003931fb50..f749feb6e3 100644 >> --- a/hw/ssi/ssi.c >> +++ b/hw/ssi/ssi.c >> @@ -38,7 +38,8 @@ static void ssi_cs_default(void *opaque, int n, int level) >> bool cs = !!level; >> assert(n == 0); >> if (s->cs != cs) { >> - SSIPeripheralClass *ssc = SSI_PERIPHERAL_GET_CLASS(s); >> + /* SSIPeripheralClass *ssc = SSI_PERIPHERAL_GET_CLASS(s); */ >> + SSIPeripheralClass *ssc = s->spc; >> if (ssc->set_cs) { >> ssc->set_cs(s, cs); >> } >> @@ -48,7 +49,8 @@ static void ssi_cs_default(void *opaque, int n, int level) >> static uint32_t ssi_transfer_raw_default(SSIPeripheral *dev, >> uint32_t val) >> { >> - SSIPeripheralClass *ssc = SSI_PERIPHERAL_GET_CLASS(dev); >> + /* SSIPeripheralClass *ssc = SSI_PERIPHERAL_GET_CLASS(dev); */ >> + SSIPeripheralClass *ssc = dev->spc; >> if ((dev->cs && ssc->cs_polarity == SSI_CS_HIGH) || >> (!dev->cs && ssc->cs_polarity == SSI_CS_LOW) || >> @@ -67,6 +69,7 @@ static void ssi_peripheral_realize(DeviceState *dev, Error **errp) >> ssc->cs_polarity != SSI_CS_NONE) { >> qdev_init_gpio_in_named(dev, ssi_cs_default, SSI_GPIO_CS, 1); >> } >> + s->spc = ssc; >> ssc->realize(s, errp); >> } >> @@ -120,7 +123,8 @@ uint32_t ssi_transfer(SSIBus *bus, uint32_t val) >> QTAILQ_FOREACH(kid, &b->children, sibling) { >> SSIPeripheral *peripheral = SSI_PERIPHERAL(kid->child); >> - ssc = SSI_PERIPHERAL_GET_CLASS(peripheral); >> + /* ssc = SSI_PERIPHERAL_GET_CLASS(peripheral); */ >> + ssc = peripheral->spc; >> r |= ssc->transfer_raw(peripheral, val); >> } >>
On 8/4/22 18:51, Alex Bennée wrote: > > Cédric Le Goater <clg@kaod.org> writes: > >> Hello Alex, >> >> Thanks for putting some time into this problem, >> >> On 8/4/22 11:20, Alex Bennée wrote: >>> Investigating why some BMC models are so slow compared to a plain ARM >>> virt machines I did some profiling of: >>> ./qemu-system-arm -M romulus-bmc -nic user \ >>> -drive >>> file=obmc-phosphor-image-romulus.static.mtd,format=raw,if=mtd \ >>> -nographic -serial mon:stdio >>> And saw that object_dynamic_cast was dominating the profile times. >>> We >>> have a number of cases in the CPU hot path and more importantly for >>> this model in the SSI bus. As the class is static once the object is >>> created we just cache it and use it instead of the dynamic case >>> macros. >>> [AJB: I suspect a proper fix for this is for QOM to support a cached >>> class lookup, abortive macro attempt #if 0'd in this patch]. >>> Signed-off-by: Alex Bennée <alex.bennee@linaro.org> >>> Cc: Cédric Le Goater <clg@kaod.org> >> >> >> Here are some results, >> >> * romulus-bmc, OpenBmc login prompt >> >> without : 82s >> with : 56s > > Looks like I lucked out picking the lowest hanging fruit. That's a huge improvement. I tend to use buildroot mostly for FW and kernel dev but OpenBMC has become as complex as a common server distro. The above result is probably faster than real HW, for the AST2400 and AST2500 at least. >> >> * ast2500-evb,execute-in-place=true, U-boot 2019.04 prompt >> >> without : 30s >> with : 22s >> >> * witherspoon-bmc,execute-in-place=true, U-boot 2016.07 prompt >> >> without : 5.5s >> with : 4.1s >> >> There is definitely an improvement in all scenarios. >> >> Applying a similar technique on AspeedSMCClass, I could gain >> another ~10% and boot the ast2500-evb,execute-in-place=true >> machine, up to the U-boot 2019.04 prompt, in less then 20s. > > There are some fundamentals to XIP which means they will be slower if > each instruction is being sucked through io_readx/device emulation Yes. But when using XIP, there is a huge time difference between two U-boot versions. See above. It takes 4s to reach the U-boot prompt of the older 2016.07 and 22s on the newer U-boot 2019.04. > although I'm not sure what the exact mechanism is because surely a ROM > can just be mapped into the address space and run from there? It can and that's the default QEMU mode for the Aspeed machines. The flash contents is copied in a ROM at 0x0. See commit 1a15311a12fa ("hw/arm/aspeed: add a 'execute-in-place' property to boot directly from CE0") That's not exactly how the HW works and there are still some FW (like uboot on the AST2600 BMC of some Meta boards) which will fetch instructions to execute from the flash contents region at 0x20000000 and not use the ROM region copied at 0x0. >> However, newer u-boot are still quite slow to boot when executing >> from the flash device. > > For any of those machines? Yes. It gets worse with the AST2600, which has 2 CPUs > Whats the next command line for me to dig into? Here are images to reproduce. * U-Boot 2016.07: wget https://github.com/openbmc/openbmc/releases/download/2.9.0/obmc-phosphor-image-romulus.static.mtd qemu-system-arm -M romulus-bmc -drive file=./obmc-phosphor-image-romulus.static.mtd,format=raw,if=mtd -nographic qemu-system-arm -M romulus-bmc,execute-in-place=true -drive file=./obmc-phosphor-image-romulus.static.mtd,format=raw,if=mtd -nographic * U-Boot 2019.04: wget https://www.kaod.org/qemu/aspeed/romulus/flash-romulus-bmc same commands Thanks, C.
diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h index 500503da13..70027a772e 100644 --- a/include/hw/core/cpu.h +++ b/include/hw/core/cpu.h @@ -317,6 +317,8 @@ struct qemu_work_item; struct CPUState { /*< private >*/ DeviceState parent_obj; + /* cache to avoid expensive CPU_GET_CLASS */ + CPUClass *cc; /*< public >*/ int nr_cores; diff --git a/include/hw/ssi/ssi.h b/include/hw/ssi/ssi.h index f411858ab0..6950f86810 100644 --- a/include/hw/ssi/ssi.h +++ b/include/hw/ssi/ssi.h @@ -59,6 +59,9 @@ struct SSIPeripheralClass { struct SSIPeripheral { DeviceState parent_obj; + /* cache the class */ + SSIPeripheralClass *spc; + /* Chip select state */ bool cs; }; diff --git a/include/qom/object.h b/include/qom/object.h index ef7258a5e1..2202dbfa43 100644 --- a/include/qom/object.h +++ b/include/qom/object.h @@ -198,6 +198,35 @@ struct Object OBJ_NAME##_CLASS(const void *klass) \ { return OBJECT_CLASS_CHECK(ClassType, klass, TYPENAME); } +#if 0 +/** + * DECLARE_CACHED_CLASS_CHECKER: + * @InstanceType: instance struct name + * @ClassType: class struct name + * @OBJ_NAME: the object name in uppercase with underscore separators + * @TYPENAME: type name + * + * This variant of DECLARE_CLASS_CHECKERS allows for the caching of + * class in the parent object instance. This is useful for very hot + * path code at the expense of an extra indirection and check. As per + * the original direct usage of this macro should be avoided if the + * complete OBJECT_DECLARE_TYPE macro has been used. + * + * This macro will provide the class type cast functions for a + * QOM type. + */ +#define DECLARE_CACHED_CLASS_CHECKERS(InstanceType, ClassType, OBJ_NAME, TYPENAME) \ + DECLARE_CLASS_CHECKERS(ClassType, OBJ_NAME, TYPENAME) \ + static inline G_GNUC_UNUSED ClassType * \ + OBJ_NAME##_GET_CACHED_CLASS(const void *obj) \ + { \ + InstanceType *p = (InstanceType *) obj; \ + p->cc = p->cc ? p->cc : OBJECT_GET_CLASS(ClassType, obj, TYPENAME);\ + return p->cc; \ + } + +#endif + /** * DECLARE_OBJ_CHECKERS: * @InstanceType: instance struct name diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c index a46f3a654d..882315f7dd 100644 --- a/accel/tcg/cputlb.c +++ b/accel/tcg/cputlb.c @@ -1303,8 +1303,9 @@ static inline ram_addr_t qemu_ram_addr_from_host_nofail(void *ptr) static void tlb_fill(CPUState *cpu, target_ulong addr, int size, MMUAccessType access_type, int mmu_idx, uintptr_t retaddr) { - CPUClass *cc = CPU_GET_CLASS(cpu); + CPUClass *cc = cpu->cc ? cpu->cc : CPU_GET_CLASS(cpu); bool ok; + cpu->cc = cc; /* * This is not a probe, so only valid return is success; failure @@ -1319,7 +1320,8 @@ static inline void cpu_unaligned_access(CPUState *cpu, vaddr addr, MMUAccessType access_type, int mmu_idx, uintptr_t retaddr) { - CPUClass *cc = CPU_GET_CLASS(cpu); + CPUClass *cc = cpu->cc ? cpu->cc : CPU_GET_CLASS(cpu); + cpu->cc = cc; cc->tcg_ops->do_unaligned_access(cpu, addr, access_type, mmu_idx, retaddr); } @@ -1331,7 +1333,8 @@ static inline void cpu_transaction_failed(CPUState *cpu, hwaddr physaddr, MemTxResult response, uintptr_t retaddr) { - CPUClass *cc = CPU_GET_CLASS(cpu); + CPUClass *cc = cpu->cc ? cpu->cc : CPU_GET_CLASS(cpu); + cpu->cc = cc; if (!cpu->ignore_memory_transaction_failures && cc->tcg_ops->do_transaction_failed) { @@ -1606,7 +1609,8 @@ static int probe_access_internal(CPUArchState *env, target_ulong addr, if (!tlb_hit_page(tlb_addr, page_addr)) { if (!victim_tlb_hit(env, mmu_idx, index, elt_ofs, page_addr)) { CPUState *cs = env_cpu(env); - CPUClass *cc = CPU_GET_CLASS(cs); + CPUClass *cc = cs->cc ? cs->cc : CPU_GET_CLASS(cs); + cs->cc = cc; if (!cc->tcg_ops->tlb_fill(cs, addr, fault_size, access_type, mmu_idx, nonfault, retaddr)) { diff --git a/hw/ssi/ssi.c b/hw/ssi/ssi.c index 003931fb50..f749feb6e3 100644 --- a/hw/ssi/ssi.c +++ b/hw/ssi/ssi.c @@ -38,7 +38,8 @@ static void ssi_cs_default(void *opaque, int n, int level) bool cs = !!level; assert(n == 0); if (s->cs != cs) { - SSIPeripheralClass *ssc = SSI_PERIPHERAL_GET_CLASS(s); + /* SSIPeripheralClass *ssc = SSI_PERIPHERAL_GET_CLASS(s); */ + SSIPeripheralClass *ssc = s->spc; if (ssc->set_cs) { ssc->set_cs(s, cs); } @@ -48,7 +49,8 @@ static void ssi_cs_default(void *opaque, int n, int level) static uint32_t ssi_transfer_raw_default(SSIPeripheral *dev, uint32_t val) { - SSIPeripheralClass *ssc = SSI_PERIPHERAL_GET_CLASS(dev); + /* SSIPeripheralClass *ssc = SSI_PERIPHERAL_GET_CLASS(dev); */ + SSIPeripheralClass *ssc = dev->spc; if ((dev->cs && ssc->cs_polarity == SSI_CS_HIGH) || (!dev->cs && ssc->cs_polarity == SSI_CS_LOW) || @@ -67,6 +69,7 @@ static void ssi_peripheral_realize(DeviceState *dev, Error **errp) ssc->cs_polarity != SSI_CS_NONE) { qdev_init_gpio_in_named(dev, ssi_cs_default, SSI_GPIO_CS, 1); } + s->spc = ssc; ssc->realize(s, errp); } @@ -120,7 +123,8 @@ uint32_t ssi_transfer(SSIBus *bus, uint32_t val) QTAILQ_FOREACH(kid, &b->children, sibling) { SSIPeripheral *peripheral = SSI_PERIPHERAL(kid->child); - ssc = SSI_PERIPHERAL_GET_CLASS(peripheral); + /* ssc = SSI_PERIPHERAL_GET_CLASS(peripheral); */ + ssc = peripheral->spc; r |= ssc->transfer_raw(peripheral, val); }
Investigating why some BMC models are so slow compared to a plain ARM virt machines I did some profiling of: ./qemu-system-arm -M romulus-bmc -nic user \ -drive file=obmc-phosphor-image-romulus.static.mtd,format=raw,if=mtd \ -nographic -serial mon:stdio And saw that object_dynamic_cast was dominating the profile times. We have a number of cases in the CPU hot path and more importantly for this model in the SSI bus. As the class is static once the object is created we just cache it and use it instead of the dynamic case macros. [AJB: I suspect a proper fix for this is for QOM to support a cached class lookup, abortive macro attempt #if 0'd in this patch]. Signed-off-by: Alex Bennée <alex.bennee@linaro.org> Cc: Cédric Le Goater <clg@kaod.org> --- include/hw/core/cpu.h | 2 ++ include/hw/ssi/ssi.h | 3 +++ include/qom/object.h | 29 +++++++++++++++++++++++++++++ accel/tcg/cputlb.c | 12 ++++++++---- hw/ssi/ssi.c | 10 +++++++--- 5 files changed, 49 insertions(+), 7 deletions(-)