diff mbox series

[6/6] accel/tcg: Always require can_do_io

Message ID 20230914174436.1597356-7-richard.henderson@linaro.org
State Superseded
Headers show
Series accel/tcg: Always require can_do_io (#1866) | expand

Commit Message

Richard Henderson Sept. 14, 2023, 5:44 p.m. UTC
Require i/o as the last insn of a TranslationBlock always,
not only with icount.  This is required for i/o that alters
the address space, such as a pci config space write.

Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1866
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 accel/tcg/translator.c      | 20 +++++++-------------
 target/mips/tcg/translate.c |  1 -
 2 files changed, 7 insertions(+), 14 deletions(-)

Comments

Philippe Mathieu-Daudé Sept. 19, 2023, 9:37 a.m. UTC | #1
On 14/9/23 19:44, Richard Henderson wrote:
> Require i/o as the last insn of a TranslationBlock always,
> not only with icount.  This is required for i/o that alters
> the address space, such as a pci config space write.
> 
> Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1866
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>   accel/tcg/translator.c      | 20 +++++++-------------
>   target/mips/tcg/translate.c |  1 -
>   2 files changed, 7 insertions(+), 14 deletions(-)

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Clément Chigot Oct. 24, 2023, 9:50 a.m. UTC | #2
Hi Richard,

This commit has broken some of our internal bareboard testing on
Risc-V 64. At some point in our programs, there is an AMOSWAP (=
atomic swap) instruction on I/O. But since this commit, can_do_io is
set to false triggering an infinite loop.
IIUC the doc (cf [1]), atomic operations on I/O are allowed.

I think there is a CF_LAST_IO flag missing somewhere to allow it, but
I'm not sure where this should be. Do you have any ideas ?

Sadly I cannot provide a reproducer that easily, mainly because our
microchip has a few patches not yet merged making our binaries not
running on the upstream master.
But here is a bit of the in_asm backtrace:

  | IN: system__bb__riscv_plic__initialize
  | Priv: 3; Virt: 0
  | 0x80000eb4:  1141              addi                    sp,sp,-16
  | 0x80000eb6:  0c0027b7          lui                     a5,49154
  | 0x80000eba:  e406              sd                      ra,8(sp)
  | 0x80000ebc:  00010597          auipc                   a1,16
            # 0x80010ebc
  | 0x80000ec0:  47458593          addi                    a1,a1,1140
  | 0x80000ec4:  f3ffe637          lui                     a2,-49154
  | 0x80000ec8:  01878693          addi                    a3,a5,24
  | 0x80000ecc:  00f58733          add                     a4,a1,a5
  | 0x80000ed0:  9732              add                     a4,a4,a2
  | 0x80000ed2:  4318              lw                      a4,0(a4)
  | 0x80000ed4:  2701              sext.w                  a4,a4
  | 0x80000ed6:  08e7a02f          amoswap.w               zero,a4,(a5)
  | 0x80000eda:  0791              addi                    a5,a5,4
  | 0x80000edc:  fed798e3          bne                     a5,a3,-16
            # 0x80000ecc
  |
  | ----------------
  | IN: system__bb__riscv_plic__initialize
  | Priv: 3; Virt: 0
  | 0x80000ed6:  08e7a02f          amoswap.w               zero,a4,(a5)
  |
  | ----------------
  | IN: system__bb__riscv_plic__initialize
  | Priv: 3; Virt: 0
  | 0x80000ed6:  08e7a02f          amoswap.w               zero,a4,(a5)
  | * Freeze *

a5 (= 0xc002000) above is targeting an address inside sifive_plic HW.


Note IIUC the doc (cf [1]), atomic operations on I/O are allowed.
[1] https://github.com/riscv/riscv-isa-manual/blob/main/src/a-st-ext.adoc#atomic-memory-operations

Thanks in advance.

On Thu, Sep 14, 2023 at 7:45 PM Richard Henderson
<richard.henderson@linaro.org> wrote:
>
> Require i/o as the last insn of a TranslationBlock always,
> not only with icount.  This is required for i/o that alters
> the address space, such as a pci config space write.
>
> Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1866
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>  accel/tcg/translator.c      | 20 +++++++-------------
>  target/mips/tcg/translate.c |  1 -
>  2 files changed, 7 insertions(+), 14 deletions(-)
>
> diff --git a/accel/tcg/translator.c b/accel/tcg/translator.c
> index dd507cd471..358214d526 100644
> --- a/accel/tcg/translator.c
> +++ b/accel/tcg/translator.c
> @@ -28,12 +28,6 @@ static void set_can_do_io(DisasContextBase *db, bool val)
>
>  bool translator_io_start(DisasContextBase *db)
>  {
> -    uint32_t cflags = tb_cflags(db->tb);
> -
> -    if (!(cflags & CF_USE_ICOUNT)) {
> -        return false;
> -    }
> -
>      set_can_do_io(db, true);
>
>      /*
> @@ -86,15 +80,15 @@ static TCGOp *gen_tb_start(DisasContextBase *db, uint32_t cflags)
>          tcg_gen_st16_i32(count, cpu_env,
>                           offsetof(ArchCPU, neg.icount_decr.u16.low) -
>                           offsetof(ArchCPU, env));
> -        /*
> -         * cpu->can_do_io is set automatically here at the beginning of
> -         * each translation block.  The cost is minimal and only paid for
> -         * -icount, plus it would be very easy to forget doing it in the
> -         * translator.
> -         */
> -        set_can_do_io(db, db->max_insns == 1 && (cflags & CF_LAST_IO));
>      }
>
> +    /*
> +     * cpu->can_do_io is set automatically here at the beginning of
> +     * each translation block.  The cost is minimal, plus it would be
> +     * very easy to forget doing it in the translator.
> +     */
> +    set_can_do_io(db, db->max_insns == 1 && (cflags & CF_LAST_IO));
> +
>      return icount_start_insn;
>  }
>
> diff --git a/target/mips/tcg/translate.c b/target/mips/tcg/translate.c
> index 9bb40f1849..593fc80458 100644
> --- a/target/mips/tcg/translate.c
> +++ b/target/mips/tcg/translate.c
> @@ -11212,7 +11212,6 @@ static void gen_branch(DisasContext *ctx, int insn_bytes)
>          /* Branches completion */
>          clear_branch_hflags(ctx);
>          ctx->base.is_jmp = DISAS_NORETURN;
> -        /* FIXME: Need to clear can_do_io.  */
>          switch (proc_hflags & MIPS_HFLAG_BMASK_BASE) {
>          case MIPS_HFLAG_FBNSLOT:
>              gen_goto_tb(ctx, 0, ctx->base.pc_next + insn_bytes);
> --
> 2.34.1
>
>
Richard Henderson Oct. 26, 2023, 12:44 a.m. UTC | #3
On 10/24/23 02:50, Clément Chigot wrote:
> Hi Richard,
> 
> This commit has broken some of our internal bareboard testing on
> Risc-V 64. At some point in our programs, there is an AMOSWAP (=
> atomic swap) instruction on I/O. But since this commit, can_do_io is
> set to false triggering an infinite loop.
> IIUC the doc (cf [1]), atomic operations on I/O are allowed.
> 
> I think there is a CF_LAST_IO flag missing somewhere to allow it, but
> I'm not sure where this should be. Do you have any ideas ?
> 
> Sadly I cannot provide a reproducer that easily, mainly because our
> microchip has a few patches not yet merged making our binaries not
> running on the upstream master.
> But here is a bit of the in_asm backtrace:
> 
>    | IN: system__bb__riscv_plic__initialize
>    | Priv: 3; Virt: 0
>    | 0x80000eb4:  1141              addi                    sp,sp,-16
>    | 0x80000eb6:  0c0027b7          lui                     a5,49154
>    | 0x80000eba:  e406              sd                      ra,8(sp)
>    | 0x80000ebc:  00010597          auipc                   a1,16
>              # 0x80010ebc
>    | 0x80000ec0:  47458593          addi                    a1,a1,1140
>    | 0x80000ec4:  f3ffe637          lui                     a2,-49154
>    | 0x80000ec8:  01878693          addi                    a3,a5,24
>    | 0x80000ecc:  00f58733          add                     a4,a1,a5
>    | 0x80000ed0:  9732              add                     a4,a4,a2
>    | 0x80000ed2:  4318              lw                      a4,0(a4)
>    | 0x80000ed4:  2701              sext.w                  a4,a4
>    | 0x80000ed6:  08e7a02f          amoswap.w               zero,a4,(a5)
>    | 0x80000eda:  0791              addi                    a5,a5,4
>    | 0x80000edc:  fed798e3          bne                     a5,a3,-16
>              # 0x80000ecc
>    |
>    | ----------------
>    | IN: system__bb__riscv_plic__initialize
>    | Priv: 3; Virt: 0
>    | 0x80000ed6:  08e7a02f          amoswap.w               zero,a4,(a5)
>    |
>    | ----------------
>    | IN: system__bb__riscv_plic__initialize
>    | Priv: 3; Virt: 0
>    | 0x80000ed6:  08e7a02f          amoswap.w               zero,a4,(a5)
>    | * Freeze *

I would expect two translations:

(1) with the original TB, aborts execution on !can_do_io.
(2) with the second TB, we get further into the actual execution and abort execution on 
TLB_MMIO.
(3) with the third TB, we clear CF_PARALLEL and execute under cpu_exec_step_atomic.

Both 2 and 3 should have had CF_LAST_IO set.
You can verify this with '-d exec' output.

As a trivial example from qemu-system-alpha bios startup:

> Trace 0: 0x7f2584008380 [00000000/fffffc0000003ee4/01000000/ff000000] uart_init_line
> cpu_io_recompile: rewound execution of TB to fffffc0000003ee4
> ----------------
> IN: uart_init_line
> 0xfffffc0000003f20:  stb        t8,0(t6)
> 
> Trace 0: 0x7f2584008a00 [00000000/fffffc0000003f20/01000000/ff018001] uart_init_line

Note that the final "/" field is cflags.  The first "Trace" corresponds to (1), where the 
store is in the middle of the TB.  You can see the io_recompile abort, then the second 
"Trace" contains {CF_COUNT=1, CF_LAST_IO, CF_MEMI_ONLY}.

In the short term, try adding CF_LAST_IO to cflags in cpu_exec_step_atomic.

I think probably the logic of CF_LAST_IO should always be applied now, since can_do_io is 
always live, and thus the flag itself should go away.


r~
Clément Chigot Oct. 26, 2023, 8:10 a.m. UTC | #4
On Thu, Oct 26, 2023 at 2:44 AM Richard Henderson
<richard.henderson@linaro.org> wrote:
>
> On 10/24/23 02:50, Clément Chigot wrote:
> > Hi Richard,
> >
> > This commit has broken some of our internal bareboard testing on
> > Risc-V 64. At some point in our programs, there is an AMOSWAP (=
> > atomic swap) instruction on I/O. But since this commit, can_do_io is
> > set to false triggering an infinite loop.
> > IIUC the doc (cf [1]), atomic operations on I/O are allowed.
> >
> > I think there is a CF_LAST_IO flag missing somewhere to allow it, but
> > I'm not sure where this should be. Do you have any ideas ?
> >
> > Sadly I cannot provide a reproducer that easily, mainly because our
> > microchip has a few patches not yet merged making our binaries not
> > running on the upstream master.
> > But here is a bit of the in_asm backtrace:
> >
> >    | IN: system__bb__riscv_plic__initialize
> >    | Priv: 3; Virt: 0
> >    | 0x80000eb4:  1141              addi                    sp,sp,-16
> >    | 0x80000eb6:  0c0027b7          lui                     a5,49154
> >    | 0x80000eba:  e406              sd                      ra,8(sp)
> >    | 0x80000ebc:  00010597          auipc                   a1,16
> >              # 0x80010ebc
> >    | 0x80000ec0:  47458593          addi                    a1,a1,1140
> >    | 0x80000ec4:  f3ffe637          lui                     a2,-49154
> >    | 0x80000ec8:  01878693          addi                    a3,a5,24
> >    | 0x80000ecc:  00f58733          add                     a4,a1,a5
> >    | 0x80000ed0:  9732              add                     a4,a4,a2
> >    | 0x80000ed2:  4318              lw                      a4,0(a4)
> >    | 0x80000ed4:  2701              sext.w                  a4,a4
> >    | 0x80000ed6:  08e7a02f          amoswap.w               zero,a4,(a5)
> >    | 0x80000eda:  0791              addi                    a5,a5,4
> >    | 0x80000edc:  fed798e3          bne                     a5,a3,-16
> >              # 0x80000ecc
> >    |
> >    | ----------------
> >    | IN: system__bb__riscv_plic__initialize
> >    | Priv: 3; Virt: 0
> >    | 0x80000ed6:  08e7a02f          amoswap.w               zero,a4,(a5)
> >    |
> >    | ----------------
> >    | IN: system__bb__riscv_plic__initialize
> >    | Priv: 3; Virt: 0
> >    | 0x80000ed6:  08e7a02f          amoswap.w               zero,a4,(a5)
> >    | * Freeze *
>
> I would expect two translations:
>
> (1) with the original TB, aborts execution on !can_do_io.
> (2) with the second TB, we get further into the actual execution and abort execution on
> TLB_MMIO.
> (3) with the third TB, we clear CF_PARALLEL and execute under cpu_exec_step_atomic.
>
> Both 2 and 3 should have had CF_LAST_IO set.
> You can verify this with '-d exec' output.
>
> As a trivial example from qemu-system-alpha bios startup:
>
> > Trace 0: 0x7f2584008380 [00000000/fffffc0000003ee4/01000000/ff000000] uart_init_line
> > cpu_io_recompile: rewound execution of TB to fffffc0000003ee4
> > ----------------
> > IN: uart_init_line
> > 0xfffffc0000003f20:  stb        t8,0(t6)
> >
> > Trace 0: 0x7f2584008a00 [00000000/fffffc0000003f20/01000000/ff018001] uart_init_line
>
> Note that the final "/" field is cflags.  The first "Trace" corresponds to (1), where the
> store is in the middle of the TB.  You can see the io_recompile abort, then the second
> "Trace" contains {CF_COUNT=1, CF_LAST_IO, CF_MEMI_ONLY}.

With the exec, it's indeed a bit clearer.
I do have a new translation made by cpu_io_recompile which sets
CF_LAST_IO (2). But afterward, it somehow loops back to the previous
translation which doesn't have CF_LAST_IO which calls cpu_io_recompile
again, etc. This triggers an infinite loop between these two
translations
  | ----------------
  | IN: system__bb__riscv_plic__initialize
  | 0x80000eb4:  1141              addi                    sp,sp,-16
  | ...
  | 0x80000ed4:  2701              sext.w                  a4,a4
  | 0x80000ed6:  08e7a02f          amoswap.w               zero,a4,(a5)
  | 0x80000eda:  0791              addi                    a5,a5,4
  | 0x80000edc:  fed798e3          bne                     a5,a3,-16
            # 0x80000ecc
  |
  | Linking TBs 0x7f0d3199db40 index 0 -> 0x7f0d3199dd00
  | Trace 1: 0x7f0d3199dd00
[00000000/0000000080000eb4/0b02401b/ff280000]
system__bb__riscv_plic__initialize

(1) First exec of amoswap without CF_LAST_IO.
  | ----------------
  | IN: system__bb__riscv_plic__initialize
  | Priv: 3; Virt: 0
  | 0x80000ed6:  08e7a02f          amoswap.w               zero,a4,(a5)
  |
  | Trace 1: 0x7f0d3199df80
[00000000/0000000080000ed6/0b02401b/ff200601]
system__bb__riscv_plic__initialize

(2) Second exec with CF_LAST_IO.
  | cpu_io_recompile: rewound execution of TB to 0000000080000ed6
  | ----------------
  | IN: system__bb__riscv_plic__initialize
  | Priv: 3; Virt: 0
  | 0x80000ed6:  08e7a02f          amoswap.w               zero,a4,(a5)
  |
  | Trace 1: 0x7f0d3199e140
[00000000/0000000080000ed6/0b02401b/ff298001]
system__bb__riscv_plic__initialize

Loop back (1) and start the infinite loop between (1) and (2).
  | Trace 1: 0x7f0d3199df80
[00000000/0000000080000ed6/0b02401b/ff200601]
system__bb__riscv_plic__initialize
  | cpu_io_recompile: rewound execution of TB to 0000000080000ed6
  | Trace 1: 0x7f0d3199e140
[00000000/0000000080000ed6/0b02401b/ff298001]
system__bb__riscv_plic__initialize
  | Trace 1: 0x7f0d3199df80
[00000000/0000000080000ed6/0b02401b/ff200601]
system__bb__riscv_plic__initialize
  | cpu_io_recompile: rewound execution of TB to 0000000080000ed6

I'll continue to investigate. Even if the workaround you provided
works, there is something weird happening here.

> In the short term, try adding CF_LAST_IO to cflags in cpu_exec_step_atomic.
Thanks for that workaround.

Thanks,
Clément

> I think probably the logic of CF_LAST_IO should always be applied now, since can_do_io is
> always live, and thus the flag itself should go away.
>
>
> r~
diff mbox series

Patch

diff --git a/accel/tcg/translator.c b/accel/tcg/translator.c
index dd507cd471..358214d526 100644
--- a/accel/tcg/translator.c
+++ b/accel/tcg/translator.c
@@ -28,12 +28,6 @@  static void set_can_do_io(DisasContextBase *db, bool val)
 
 bool translator_io_start(DisasContextBase *db)
 {
-    uint32_t cflags = tb_cflags(db->tb);
-
-    if (!(cflags & CF_USE_ICOUNT)) {
-        return false;
-    }
-
     set_can_do_io(db, true);
 
     /*
@@ -86,15 +80,15 @@  static TCGOp *gen_tb_start(DisasContextBase *db, uint32_t cflags)
         tcg_gen_st16_i32(count, cpu_env,
                          offsetof(ArchCPU, neg.icount_decr.u16.low) -
                          offsetof(ArchCPU, env));
-        /*
-         * cpu->can_do_io is set automatically here at the beginning of
-         * each translation block.  The cost is minimal and only paid for
-         * -icount, plus it would be very easy to forget doing it in the
-         * translator.
-         */
-        set_can_do_io(db, db->max_insns == 1 && (cflags & CF_LAST_IO));
     }
 
+    /*
+     * cpu->can_do_io is set automatically here at the beginning of
+     * each translation block.  The cost is minimal, plus it would be
+     * very easy to forget doing it in the translator.
+     */
+    set_can_do_io(db, db->max_insns == 1 && (cflags & CF_LAST_IO));
+
     return icount_start_insn;
 }
 
diff --git a/target/mips/tcg/translate.c b/target/mips/tcg/translate.c
index 9bb40f1849..593fc80458 100644
--- a/target/mips/tcg/translate.c
+++ b/target/mips/tcg/translate.c
@@ -11212,7 +11212,6 @@  static void gen_branch(DisasContext *ctx, int insn_bytes)
         /* Branches completion */
         clear_branch_hflags(ctx);
         ctx->base.is_jmp = DISAS_NORETURN;
-        /* FIXME: Need to clear can_do_io.  */
         switch (proc_hflags & MIPS_HFLAG_BMASK_BASE) {
         case MIPS_HFLAG_FBNSLOT:
             gen_goto_tb(ctx, 0, ctx->base.pc_next + insn_bytes);