diff mbox series

[v2,63/93] tcg/tci: Use ffi for calls

Message ID 20210204014509.882821-64-richard.henderson@linaro.org
State Superseded
Headers show
Series TCI fixes and cleanups | expand

Commit Message

Richard Henderson Feb. 4, 2021, 1:44 a.m. UTC
This requires adjusting where arguments are stored.
Place them on the stack at left-aligned positions.
Adjust the stack frame to be at entirely positive offsets.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

---
 include/tcg/tcg.h        |   1 +
 tcg/tci/tcg-target.h     |   2 +-
 tcg/tcg.c                |  72 ++++++++++++---------
 tcg/tci.c                | 131 ++++++++++++++++++++++-----------------
 tcg/tci/tcg-target.c.inc |  50 +++++++--------
 5 files changed, 143 insertions(+), 113 deletions(-)

-- 
2.25.1

Comments

Stefan Weil Feb. 7, 2021, 4:25 p.m. UTC | #1
Am 04.02.21 um 02:44 schrieb Richard Henderson:

> This requires adjusting where arguments are stored.

> Place them on the stack at left-aligned positions.

> Adjust the stack frame to be at entirely positive offsets.

>

> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

> ---

[...]
> diff --git a/tcg/tci.c b/tcg/tci.c

> index 6843e837ae..d27db9f720 100644

> --- a/tcg/tci.c

> +++ b/tcg/tci.c

> @@ -18,6 +18,13 @@

>    */

>   

>   #include "qemu/osdep.h"

> +#include "qemu-common.h"

> +#include "tcg/tcg.h"           /* MAX_OPC_PARAM_IARGS */

> +#include "exec/cpu_ldst.h"

> +#include "tcg/tcg-op.h"

> +#include "qemu/compiler.h"

> +#include <ffi.h>

> +



ffi.h is not found on macOS with Homebrew.

This can be fixed by using pkg-config to find the right compiler (and 
maybe also linker) flags:

% pkg-config --cflags libffi
-I/Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/usr/include/ffi
% pkg-config --libs libffi
-lffi

Regards,

Stefan
Richard Henderson Feb. 7, 2021, 5:39 p.m. UTC | #2
On 2/7/21 8:25 AM, Stefan Weil wrote:
>> +#include "qemu-common.h"

>> +#include "tcg/tcg.h"           /* MAX_OPC_PARAM_IARGS */

>> +#include "exec/cpu_ldst.h"

>> +#include "tcg/tcg-op.h"

>> +#include "qemu/compiler.h"

>> +#include <ffi.h>

>> +

> 

> 

> ffi.h is not found on macOS with Homebrew.

> 

> This can be fixed by using pkg-config to find the right compiler (and maybe

> also linker) flags:

> 

> % pkg-config --cflags libffi

> -I/Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/usr/include/ffi

> % pkg-config --libs libffi

> -lffi



Which is exactly what I do in the previous patch:


> +++ b/meson.build

> @@ -1901,7 +1901,14 @@ specific_ss.add(when: 'CONFIG_TCG', if_true: files(

>    'tcg/tcg-op.c',

>    'tcg/tcg.c',

>  ))

> -specific_ss.add(when: 'CONFIG_TCG_INTERPRETER', if_true: files('tcg/tci.c'))

> +

> +if get_option('tcg_interpreter')

> +  libffi = dependency('libffi', version: '>=3.0',

> +                      static: enable_static, method: 'pkg-config',

> +                      required: true)

> +  specific_ss.add(libffi)

> +  specific_ss.add(files('tcg/tci.c'))

> +endif


Did you need a PKG_CONFIG_LIBDIR set for homebrew?


r~
Peter Maydell Feb. 7, 2021, 7:52 p.m. UTC | #3
On Sun, 7 Feb 2021 at 17:41, Richard Henderson
<richard.henderson@linaro.org> wrote:
>

> On 2/7/21 8:25 AM, Stefan Weil wrote:

> >> +#include "qemu-common.h"

> >> +#include "tcg/tcg.h"           /* MAX_OPC_PARAM_IARGS */

> >> +#include "exec/cpu_ldst.h"

> >> +#include "tcg/tcg-op.h"

> >> +#include "qemu/compiler.h"

> >> +#include <ffi.h>

> >> +

> >

> >

> > ffi.h is not found on macOS with Homebrew.

> >

> > This can be fixed by using pkg-config to find the right compiler (and maybe

> > also linker) flags:

> >

> > % pkg-config --cflags libffi

> > -I/Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/usr/include/ffi

> > % pkg-config --libs libffi

> > -lffi

>

>

> Which is exactly what I do in the previous patch:

>

>

> > +++ b/meson.build

> > @@ -1901,7 +1901,14 @@ specific_ss.add(when: 'CONFIG_TCG', if_true: files(

> >    'tcg/tcg-op.c',

> >    'tcg/tcg.c',

> >  ))

> > -specific_ss.add(when: 'CONFIG_TCG_INTERPRETER', if_true: files('tcg/tci.c'))

> > +

> > +if get_option('tcg_interpreter')

> > +  libffi = dependency('libffi', version: '>=3.0',

> > +                      static: enable_static, method: 'pkg-config',

> > +                      required: true)

> > +  specific_ss.add(libffi)

> > +  specific_ss.add(files('tcg/tci.c'))

> > +endif

>

> Did you need a PKG_CONFIG_LIBDIR set for homebrew?


Is this the "meson doesn't actually add the cflags everywhere it should"
bug again ?

thanks
-- PMM
Richard Henderson Feb. 7, 2021, 8:12 p.m. UTC | #4
On 2/7/21 11:52 AM, Peter Maydell wrote:
> On Sun, 7 Feb 2021 at 17:41, Richard Henderson

> <richard.henderson@linaro.org> wrote:

>>

>> On 2/7/21 8:25 AM, Stefan Weil wrote:

>>>> +#include "qemu-common.h"

>>>> +#include "tcg/tcg.h"           /* MAX_OPC_PARAM_IARGS */

>>>> +#include "exec/cpu_ldst.h"

>>>> +#include "tcg/tcg-op.h"

>>>> +#include "qemu/compiler.h"

>>>> +#include <ffi.h>

>>>> +

>>>

>>>

>>> ffi.h is not found on macOS with Homebrew.

>>>

>>> This can be fixed by using pkg-config to find the right compiler (and maybe

>>> also linker) flags:

>>>

>>> % pkg-config --cflags libffi

>>> -I/Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/usr/include/ffi

>>> % pkg-config --libs libffi

>>> -lffi

>>

>>

>> Which is exactly what I do in the previous patch:

>>

>>

>>> +++ b/meson.build

>>> @@ -1901,7 +1901,14 @@ specific_ss.add(when: 'CONFIG_TCG', if_true: files(

>>>    'tcg/tcg-op.c',

>>>    'tcg/tcg.c',

>>>  ))

>>> -specific_ss.add(when: 'CONFIG_TCG_INTERPRETER', if_true: files('tcg/tci.c'))

>>> +

>>> +if get_option('tcg_interpreter')

>>> +  libffi = dependency('libffi', version: '>=3.0',

>>> +                      static: enable_static, method: 'pkg-config',

>>> +                      required: true)

>>> +  specific_ss.add(libffi)

>>> +  specific_ss.add(files('tcg/tci.c'))

>>> +endif

>>

>> Did you need a PKG_CONFIG_LIBDIR set for homebrew?

> 

> Is this the "meson doesn't actually add the cflags everywhere it should"

> bug again ?


I guess so.  I realized after sending this reply that PKG_CONFIG_LIBDIR can't
be the answer, since the original configure should have failed if pkg-config
didn't find ffi.

Was there a resolution to said meson bug?


r~
Stefan Weil Feb. 7, 2021, 9:33 p.m. UTC | #5
On 07.02.21 21:12, Richard Henderson wrote:
> On 2/7/21 11:52 AM, Peter Maydell wrote:

>> On Sun, 7 Feb 2021 at 17:41, Richard Henderson

>> <richard.henderson@linaro.org> wrote:

>>>

>>> On 2/7/21 8:25 AM, Stefan Weil wrote:

>>>>> +#include "qemu-common.h"

>>>>> +#include "tcg/tcg.h"           /* MAX_OPC_PARAM_IARGS */

>>>>> +#include "exec/cpu_ldst.h"

>>>>> +#include "tcg/tcg-op.h"

>>>>> +#include "qemu/compiler.h"

>>>>> +#include <ffi.h>

>>>>> +

>>>>

>>>>

>>>> ffi.h is not found on macOS with Homebrew.

>>>>

>>>> This can be fixed by using pkg-config to find the right compiler (and maybe

>>>> also linker) flags:

>>>>

>>>> % pkg-config --cflags libffi

>>>> -I/Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/usr/include/ffi

>>>> % pkg-config --libs libffi

>>>> -lffi

>>>

>>>

>>> Which is exactly what I do in the previous patch:

>>>

>>>

>>>> +++ b/meson.build

>>>> @@ -1901,7 +1901,14 @@ specific_ss.add(when: 'CONFIG_TCG', if_true: files(

>>>>    'tcg/tcg-op.c',

>>>>    'tcg/tcg.c',

>>>>  ))

>>>> -specific_ss.add(when: 'CONFIG_TCG_INTERPRETER', if_true: files('tcg/tci.c'))

>>>> +

>>>> +if get_option('tcg_interpreter')

>>>> +  libffi = dependency('libffi', version: '>=3.0',

>>>> +                      static: enable_static, method: 'pkg-config',

>>>> +                      required: true)

>>>> +  specific_ss.add(libffi)

>>>> +  specific_ss.add(files('tcg/tci.c'))

>>>> +endif

>>>

>>> Did you need a PKG_CONFIG_LIBDIR set for homebrew?

>>

>> Is this the "meson doesn't actually add the cflags everywhere it should"

>> bug again ?

> 

> I guess so.  I realized after sending this reply that PKG_CONFIG_LIBDIR can't

> be the answer, since the original configure should have failed if pkg-config

> didn't find ffi.

> 

> Was there a resolution to said meson bug?


Meanwhile I noticed an additional detail:

There exist two different pkg-config configurations for libffi on Homebrew:

% pkg-config --cflags libffi
-I/Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/usr/include/ffi
% export PKG_CONFIG_PATH="/opt/homebrew/opt/libffi/lib/pkgconfig"
% pkg-config --cflags libffi
-I/opt/homebrew/Cellar/libffi/3.3_2/include

By default it points to a system directory which does not exist at all
on my Mac, so that will never work.

With the right PKG_CONFIG_PATH a correct include directory is set, and
the latest rebased tci-next branch now works for me with a compiler warning:

/opt/homebrew/Cellar/libffi/3.3_2/include/ffi.h:441:5: warning:
'FFI_GO_CLOSURES' is not defined, evaluates to 0 [-Wundef]

Stefan
Peter Maydell Feb. 8, 2021, 9:20 a.m. UTC | #6
On Sun, 7 Feb 2021 at 20:12, Richard Henderson
<richard.henderson@linaro.org> wrote:
>

> On 2/7/21 11:52 AM, Peter Maydell wrote:

> > On Sun, 7 Feb 2021 at 17:41, Richard Henderson

> > <richard.henderson@linaro.org> wrote:

> >>

> >> On 2/7/21 8:25 AM, Stefan Weil wrote:

> >>>> +#include "qemu-common.h"

> >>>> +#include "tcg/tcg.h"           /* MAX_OPC_PARAM_IARGS */

> >>>> +#include "exec/cpu_ldst.h"

> >>>> +#include "tcg/tcg-op.h"

> >>>> +#include "qemu/compiler.h"

> >>>> +#include <ffi.h>

> >>>> +

> >>>

> >>>

> >>> ffi.h is not found on macOS with Homebrew.

> >>>

> >>> This can be fixed by using pkg-config to find the right compiler (and maybe

> >>> also linker) flags:

> >>>

> >>> % pkg-config --cflags libffi

> >>> -I/Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/usr/include/ffi

> >>> % pkg-config --libs libffi

> >>> -lffi

> >>

> >>

> >> Which is exactly what I do in the previous patch:

> >>

> >>

> >>> +++ b/meson.build

> >>> @@ -1901,7 +1901,14 @@ specific_ss.add(when: 'CONFIG_TCG', if_true: files(

> >>>    'tcg/tcg-op.c',

> >>>    'tcg/tcg.c',

> >>>  ))

> >>> -specific_ss.add(when: 'CONFIG_TCG_INTERPRETER', if_true: files('tcg/tci.c'))

> >>> +

> >>> +if get_option('tcg_interpreter')

> >>> +  libffi = dependency('libffi', version: '>=3.0',

> >>> +                      static: enable_static, method: 'pkg-config',

> >>> +                      required: true)

> >>> +  specific_ss.add(libffi)

> >>> +  specific_ss.add(files('tcg/tci.c'))

> >>> +endif

> >>

> >> Did you need a PKG_CONFIG_LIBDIR set for homebrew?

> >

> > Is this the "meson doesn't actually add the cflags everywhere it should"

> > bug again ?

>

> I guess so.  I realized after sending this reply that PKG_CONFIG_LIBDIR can't

> be the answer, since the original configure should have failed if pkg-config

> didn't find ffi.

>

> Was there a resolution to said meson bug?


There's a workaround involving adding the library to the dependencies
list in the right places -- commit 3eacf70bb5a83e4 did this for gnutls.
Paolo may be able to help further.

thanks
-- PMM
Paolo Bonzini Feb. 8, 2021, 9:35 a.m. UTC | #7
On 08/02/21 10:20, Peter Maydell wrote:
>>>> +

>>>> +if get_option('tcg_interpreter')

>>>> +  libffi = dependency('libffi', version: '>=3.0',

>>>> +                      static: enable_static, method: 'pkg-config',

>>>> +                      required: true)

>>>> +  specific_ss.add(libffi)

>>>> +  specific_ss.add(files('tcg/tci.c'))

>>>> +endif

>>> Did you need a PKG_CONFIG_LIBDIR set for homebrew?

>> Is this the "meson doesn't actually add the cflags everywhere it should"

>> bug again ?


No, it shouldn't be the same bug.  In this case the dependency is not 
indirect dependency, specific_ss is included directly:

   target_specific = specific_ss.apply(config_target, strict: false)
   arch_srcs += target_specific.sources()
   arch_deps += target_specific.dependencies()

   lib = static_library('qemu-' + target,
                  sources: arch_srcs + genh,
                  dependencies: arch_deps,
                  objects: objects,
                  include_directories: target_inc,
                  c_args: c_args,
                  build_by_default: false,
                  name_suffix: 'fa')

It's more likely to be what Stefan pointed out later:

> Meanwhile I noticed an additional detail:

> 

> There exist two different pkg-config configurations for libffi on Homebrew:

> 

> % pkg-config --cflags libffi

> -I/Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/usr/include/ffi

> % export PKG_CONFIG_PATH="/opt/homebrew/opt/libffi/lib/pkgconfig"

> % pkg-config --cflags libffi

> -I/opt/homebrew/Cellar/libffi/3.3_2/include

>

> By default it points to a system directory which does not exist at all

> on my Mac, so that will never work.


Paolo
Stefan Weil Feb. 8, 2021, 1:07 p.m. UTC | #8
Am 08.02.21 um 10:35 schrieb Paolo Bonzini:

> On 08/02/21 10:20, Peter Maydell wrote:

>>>>> +

>>>>> +if get_option('tcg_interpreter')

>>>>> +  libffi = dependency('libffi', version: '>=3.0',

>>>>> +                      static: enable_static, method: 'pkg-config',

>>>>> +                      required: true)

>>>>> +  specific_ss.add(libffi)

>>>>> +  specific_ss.add(files('tcg/tci.c'))

>>>>> +endif

>>>> Did you need a PKG_CONFIG_LIBDIR set for homebrew?

>>> Is this the "meson doesn't actually add the cflags everywhere it 

>>> should"

>>> bug again ?

>

> No, it shouldn't be the same bug.  In this case the dependency is not 

> indirect dependency, specific_ss is included directly:

>

>   target_specific = specific_ss.apply(config_target, strict: false)

>   arch_srcs += target_specific.sources()

>   arch_deps += target_specific.dependencies()

>

>   lib = static_library('qemu-' + target,

>                  sources: arch_srcs + genh,

>                  dependencies: arch_deps,

>                  objects: objects,

>                  include_directories: target_inc,

>                  c_args: c_args,

>                  build_by_default: false,

>                  name_suffix: 'fa')

>

> It's more likely to be what Stefan pointed out later:

>

>> Meanwhile I noticed an additional detail:

>>

>> There exist two different pkg-config configurations for libffi on 

>> Homebrew:

>>

>> % pkg-config --cflags libffi

>> -I/Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/usr/include/ffi

>> % export PKG_CONFIG_PATH="/opt/homebrew/opt/libffi/lib/pkgconfig"

>> % pkg-config --cflags libffi

>> -I/opt/homebrew/Cellar/libffi/3.3_2/include

>>

>> By default it points to a system directory which does not exist at all

>> on my Mac, so that will never work.

>

> Paolo



Yes, it looks like setting the right PKG_CONFIG_PATH is sufficient for 
builds with Homebrew on macOS.

Richard, this commit is also the one which breaks qemu-system-i386 on 
sparc64 for me:

sw@gcc102:~/src/gitlab/qemu-project/qemu$ git bisect good
115a01c323e6a01902894ec23ba704bf3dc8215a is the first bad commit
commit 115a01c323e6a01902894ec23ba704bf3dc8215a
Author: Richard Henderson <richard.henderson@linaro.org>
Date:   Sat Jan 30 14:24:25 2021 -0800

     tcg/tci: Use ffi for calls

     This requires adjusting where arguments are stored.
     Place them on the stack at left-aligned positions.
     Adjust the stack frame to be at entirely positive offsets.

     Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

  include/tcg/tcg.h        |   1 +
  tcg/tcg.c                |  72 +++++++++++++++-----------
  tcg/tci.c                | 131 
++++++++++++++++++++++++++---------------------
  tcg/tci/tcg-target.c.inc |  50 +++++++++---------
  tcg/tci/tcg-target.h     |   2 +-
  5 files changed, 143 insertions(+), 113 deletions(-)

Regards,

Stefan
Richard Henderson Feb. 8, 2021, 5:39 p.m. UTC | #9
On 2/8/21 5:07 AM, Stefan Weil wrote:
> Richard, this commit is also the one which breaks qemu-system-i386 on sparc64

> for me:


You'll have to give me more details than that, because qemu-system-i386 works
for me on a niagara5 w/ debian sid.


r~
Stefan Weil Feb. 8, 2021, 7:04 p.m. UTC | #10
Am 08.02.21 um 18:39 schrieb Richard Henderson:
> On 2/8/21 5:07 AM, Stefan Weil wrote:

>> Richard, this commit is also the one which breaks qemu-system-i386 on sparc64

>> for me:

> You'll have to give me more details than that, because qemu-system-i386 works

> for me on a niagara5 w/ debian sid.



I am testing on a similar Debian system (debian-ports unstable), but 
with a Niagara3 cpu:

Linux gcc102.fsffrance.org 5.10.0-3-sparc64-smp #1 SMP Debian 5.10.12-1 
(2021-01-30) sparc64 GNU/Linux

gcc (Debian 10.2.1-6) 10.2.1 20210110

$ cat /proc/cpuinfo
cpu        : UltraSparc T3 (Niagara3)
fpu        : UltraSparc T3 integrated FPU
pmu        : niagara3
prom        : OBP 4.34.6.c 2017/03/22 13:55
type        : sun4v
ncpus probed    : 256
ncpus active    : 256
D$ parity tl1    : 0
I$ parity tl1    : 0
cpucaps        : 
flush,stbar,swap,muldiv,v9,blkinit,n2,mul32,div32,v8plus,popc,vis,vis2,ASIBlkInit,fmaf,vis3,hpc
[...]

During "git bisect" I had to apply fixes for unaligned memory access as 
your series starts with TCI code which does not include my patch for 
that. The first bisect step was tested with your tci-next branch, the 
last step was tested with my bisect-tci branch 
(https://gitlab.com/stweil/qemu/-/commits/bisect-tci).

Stefan
Richard Henderson Feb. 8, 2021, 10:55 p.m. UTC | #11
On 2/8/21 11:04 AM, Stefan Weil wrote:
> 

> Am 08.02.21 um 18:39 schrieb Richard Henderson:

>> On 2/8/21 5:07 AM, Stefan Weil wrote:

>>> Richard, this commit is also the one which breaks qemu-system-i386 on sparc64

>>> for me:

>> You'll have to give me more details than that, because qemu-system-i386 works

>> for me on a niagara5 w/ debian sid.

> 

> 

> I am testing on a similar Debian system (debian-ports unstable), but with a

> Niagara3 cpu:

> 

> Linux gcc102.fsffrance.org 5.10.0-3-sparc64-smp #1 SMP Debian 5.10.12-1

> (2021-01-30) sparc64 GNU/Linux

> 

> gcc (Debian 10.2.1-6) 10.2.1 20210110

> 

> $ cat /proc/cpuinfo

> cpu        : UltraSparc T3 (Niagara3)

> fpu        : UltraSparc T3 integrated FPU

> pmu        : niagara3

> prom        : OBP 4.34.6.c 2017/03/22 13:55

> type        : sun4v

> ncpus probed    : 256

> ncpus active    : 256

> D$ parity tl1    : 0

> I$ parity tl1    : 0

> cpucaps        :

> flush,stbar,swap,muldiv,v9,blkinit,n2,mul32,div32,v8plus,popc,vis,vis2,ASIBlkInit,fmaf,vis3,hpc

> 


Ok, I've reproduced something on a T3 (gcc102.fsffrance.org).
Running the same code side-by-side vs the T5, I get different results.

I'll see if I can track down the difference, since they're both running the
same base os.


r~
Richard Henderson Feb. 9, 2021, 8:46 p.m. UTC | #12
On 2/8/21 2:55 PM, Richard Henderson wrote:
> Ok, I've reproduced something on a T3 (gcc102.fsffrance.org).

> Running the same code side-by-side vs the T5, I get different results.


Brown paper bag time: the T5 build dir lost the --enable-tcg-interpreter flag,
so was testing tcg native.

Big-endian bug wrt an odd api wart in libffi.  Fixed thus.


r~
diff --git a/tcg/tci.c b/tcg/tci.c
index d27db9f720..dd0cca296a 100644
--- a/tcg/tci.c
+++ b/tcg/tci.c
@@ -557,8 +557,15 @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
             case 0: /* void */
                 break;
             case 1: /* uint32_t */
-                regs[TCG_REG_R0] = *(uint32_t *)stack;
-                break;
+                /*
+                 * Note that libffi has an odd special case in that it will
+                 * always widen an integral result to ffi_arg.
+                 */
+                if (sizeof(ffi_arg) == 4) {
+                    regs[TCG_REG_R0] = *(uint32_t *)stack;
+                    break;
+                }
+                /* fall through */
             case 2: /* uint64_t */
                 if (TCG_TARGET_REG_BITS == 32) {
                     tci_write_reg64(regs, TCG_REG_R1, TCG_REG_R0, stack[0]);
Stefan Weil Feb. 9, 2021, 9:15 p.m. UTC | #13
Am 09.02.21 um 21:46 schrieb Richard Henderson:

> On 2/8/21 2:55 PM, Richard Henderson wrote:

>> Ok, I've reproduced something on a T3 (gcc102.fsffrance.org).

>> Running the same code side-by-side vs the T5, I get different results.

> Brown paper bag time: the T5 build dir lost the --enable-tcg-interpreter flag,

> so was testing tcg native.

>

> Big-endian bug wrt an odd api wart in libffi.  Fixed thus.



Thanks for solving this. The patch works for me.

BIOS boot time with qemu-system-i386 is about 41 s (with my code which 
lacks thread support and ffi it was 40 s).

With qemu-system-x86_64 it is twice as fast, so it looks like in my last 
report where I said that the new code had doubled the speed I compared 
different system emulations.

Apropos "slow" TCI: on Apple's M1 it is faster than native TCG on most 
of my Intel / AMD machines. And it works with current git master while 
native TCG still waits for pending patches which fix the memory access.

Stefan
Stefan Weil Feb. 9, 2021, 9:54 p.m. UTC | #14
Am 09.02.21 um 22:15 schrieb Stefan Weil:

>

> Thanks for solving this. The patch works for me.

>

> BIOS boot time with qemu-system-i386 is about 41 s (with my code which 

> lacks thread support and ffi it was 40 s).

>

> With qemu-system-x86_64 it is twice as fast, so it looks like in my 

> last report where I said that the new code had doubled the speed I 

> compared different system emulations.



Update: with Richard's latest tci-next branch which includes the fixed 
code both qemu-system-x86_64 and qemu-system-i386 require about 20 s 
user time for the BIOS boot.

Stefan
diff mbox series

Patch

diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
index 0f0695e90d..e5573a9877 100644
--- a/include/tcg/tcg.h
+++ b/include/tcg/tcg.h
@@ -53,6 +53,7 @@ 
 #define MAX_OPC_PARAM (4 + (MAX_OPC_PARAM_PER_ARG * MAX_OPC_PARAM_ARGS))
 
 #define CPU_TEMP_BUF_NLONGS 128
+#define TCG_STATIC_FRAME_SIZE  (CPU_TEMP_BUF_NLONGS * sizeof(long))
 
 /* Default target word size to pointer size.  */
 #ifndef TCG_TARGET_REG_BITS
diff --git a/tcg/tci/tcg-target.h b/tcg/tci/tcg-target.h
index 52af6d8bc5..4df10e2e83 100644
--- a/tcg/tci/tcg-target.h
+++ b/tcg/tci/tcg-target.h
@@ -161,7 +161,7 @@  typedef enum {
 
 /* Used for function call generation. */
 #define TCG_TARGET_CALL_STACK_OFFSET    0
-#define TCG_TARGET_STACK_ALIGN          16
+#define TCG_TARGET_STACK_ALIGN          8
 
 #define HAVE_TCG_QEMU_TB_EXEC
 
diff --git a/tcg/tcg.c b/tcg/tcg.c
index 6382112215..92aec0d238 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -208,6 +208,18 @@  static size_t tree_size;
 static TCGRegSet tcg_target_available_regs[TCG_TYPE_COUNT];
 static TCGRegSet tcg_target_call_clobber_regs;
 
+typedef struct TCGHelperInfo {
+    void *func;
+#ifdef CONFIG_TCG_INTERPRETER
+    ffi_cif *cif;
+#endif
+    const char *name;
+    unsigned flags;
+    unsigned sizemask;
+} TCGHelperInfo;
+
+static GHashTable *helper_table;
+
 #if TCG_TARGET_INSN_UNIT_SIZE == 1
 static __attribute__((unused)) inline void tcg_out8(TCGContext *s, uint8_t v)
 {
@@ -1084,16 +1096,6 @@  void tcg_pool_reset(TCGContext *s)
     s->pool_current = NULL;
 }
 
-typedef struct TCGHelperInfo {
-    void *func;
-#ifdef CONFIG_TCG_INTERPRETER
-    ffi_cif *cif;
-#endif
-    const char *name;
-    unsigned flags;
-    unsigned sizemask;
-} TCGHelperInfo;
-
 #include "exec/helper-proto.h"
 
 #ifdef CONFIG_TCG_INTERPRETER
@@ -1103,7 +1105,6 @@  typedef struct TCGHelperInfo {
 static const TCGHelperInfo all_helpers[] = {
 #include "exec/helper-tcg.h"
 };
-static GHashTable *helper_table;
 
 static int indirect_reg_alloc_order[ARRAY_SIZE(tcg_target_reg_alloc_order)];
 static void process_op_defs(TCGContext *s);
@@ -2081,25 +2082,38 @@  void tcg_gen_callN(void *func, TCGTemp *ret, int nargs, TCGTemp **args)
 
     real_args = 0;
     for (i = 0; i < nargs; i++) {
-        int is_64bit = sizemask & (1 << (i+1)*2);
-        if (TCG_TARGET_REG_BITS < 64 && is_64bit) {
-#ifdef TCG_TARGET_CALL_ALIGN_ARGS
-            /* some targets want aligned 64 bit args */
-            if (real_args & 1) {
-                op->args[pi++] = TCG_CALL_DUMMY_ARG;
-                real_args++;
-            }
+        bool is_64bit = sizemask & (1 << (i+1)*2);
+        bool want_align = false;
+
+#if defined(CONFIG_TCG_INTERPRETER)
+        /*
+         * Align all arguments, so that they land in predictable places
+         * for passing off to ffi_call.
+         */
+        want_align = true;
+#elif defined(TCG_TARGET_CALL_ALIGN_ARGS)
+        /* Some targets want aligned 64 bit args */
+        want_align = is_64bit;
 #endif
-           /* If stack grows up, then we will be placing successive
-              arguments at lower addresses, which means we need to
-              reverse the order compared to how we would normally
-              treat either big or little-endian.  For those arguments
-              that will wind up in registers, this still works for
-              HPPA (the only current STACK_GROWSUP target) since the
-              argument registers are *also* allocated in decreasing
-              order.  If another such target is added, this logic may
-              have to get more complicated to differentiate between
-              stack arguments and register arguments.  */
+
+        if (TCG_TARGET_REG_BITS < 64 && want_align && (real_args & 1)) {
+            op->args[pi++] = TCG_CALL_DUMMY_ARG;
+            real_args++;
+        }
+
+        if (TCG_TARGET_REG_BITS < 64 && is_64bit) {
+           /*
+            * If stack grows up, then we will be placing successive
+            * arguments at lower addresses, which means we need to
+            * reverse the order compared to how we would normally
+            * treat either big or little-endian.  For those arguments
+            * that will wind up in registers, this still works for
+            * HPPA (the only current STACK_GROWSUP target) since the
+            * argument registers are *also* allocated in decreasing
+            * order.  If another such target is added, this logic may
+            * have to get more complicated to differentiate between
+            * stack arguments and register arguments.
+            */
 #if defined(HOST_WORDS_BIGENDIAN) != defined(TCG_TARGET_STACK_GROWSUP)
             op->args[pi++] = temp_arg(args[i] + 1);
             op->args[pi++] = temp_arg(args[i]);
diff --git a/tcg/tci.c b/tcg/tci.c
index 6843e837ae..d27db9f720 100644
--- a/tcg/tci.c
+++ b/tcg/tci.c
@@ -18,6 +18,13 @@ 
  */
 
 #include "qemu/osdep.h"
+#include "qemu-common.h"
+#include "tcg/tcg.h"           /* MAX_OPC_PARAM_IARGS */
+#include "exec/cpu_ldst.h"
+#include "tcg/tcg-op.h"
+#include "qemu/compiler.h"
+#include <ffi.h>
+
 
 /* Enable TCI assertions only when debugging TCG (and without NDEBUG defined).
  * Without assertions, the interpreter runs much faster. */
@@ -27,36 +34,8 @@ 
 # define tci_assert(cond) ((void)(cond))
 #endif
 
-#include "qemu-common.h"
-#include "tcg/tcg.h"           /* MAX_OPC_PARAM_IARGS */
-#include "exec/cpu_ldst.h"
-#include "tcg/tcg-op.h"
-#include "qemu/compiler.h"
-
-#if MAX_OPC_PARAM_IARGS != 6
-# error Fix needed, number of supported input arguments changed!
-#endif
-#if TCG_TARGET_REG_BITS == 32
-typedef uint64_t (*helper_function)(tcg_target_ulong, tcg_target_ulong,
-                                    tcg_target_ulong, tcg_target_ulong,
-                                    tcg_target_ulong, tcg_target_ulong,
-                                    tcg_target_ulong, tcg_target_ulong,
-                                    tcg_target_ulong, tcg_target_ulong,
-                                    tcg_target_ulong, tcg_target_ulong);
-#else
-typedef uint64_t (*helper_function)(tcg_target_ulong, tcg_target_ulong,
-                                    tcg_target_ulong, tcg_target_ulong,
-                                    tcg_target_ulong, tcg_target_ulong);
-#endif
-
 __thread uintptr_t tci_tb_ptr;
 
-static tcg_target_ulong tci_read_reg(const tcg_target_ulong *regs, TCGReg index)
-{
-    tci_assert(index < TCG_TARGET_NB_REGS);
-    return regs[index];
-}
-
 static void
 tci_write_reg(tcg_target_ulong *regs, TCGReg index, tcg_target_ulong value)
 {
@@ -131,6 +110,7 @@  static tcg_target_ulong tci_read_label(const uint8_t **tb_ptr)
  *   i = immediate (uint32_t)
  *   I = immediate (tcg_target_ulong)
  *   m = immediate (TCGMemOpIdx)
+ *   n = immediate (call return length)
  *   r = register
  *   s = signed ldst offset
  */
@@ -151,6 +131,16 @@  static void tci_args_l(const uint8_t **tb_ptr, void **l0)
     check_size(start, tb_ptr);
 }
 
+static void tci_args_nl(const uint8_t **tb_ptr, uint8_t *n0, void **l1)
+{
+    const uint8_t *start = *tb_ptr;
+
+    *n0 = tci_read_b(tb_ptr);
+    *l1 = (void *)tci_read_label(tb_ptr);
+
+    check_size(start, tb_ptr);
+}
+
 static void tci_args_rr(const uint8_t **tb_ptr,
                         TCGReg *r0, TCGReg *r1)
 {
@@ -491,6 +481,7 @@  static bool tci_compare64(uint64_t u0, uint64_t u1, TCGCond condition)
 # define CASE_64(x)
 #endif
 
+
 /* Interpret pseudo code in tb. */
 /*
  * Disable CFI checks.
@@ -502,11 +493,13 @@  uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
 {
     const uint8_t *tb_ptr = v_tb_ptr;
     tcg_target_ulong regs[TCG_TARGET_NB_REGS];
-    long tcg_temps[CPU_TEMP_BUF_NLONGS];
-    uintptr_t sp_value = (uintptr_t)(tcg_temps + CPU_TEMP_BUF_NLONGS);
+    uint64_t stack[(TCG_STATIC_CALL_ARGS_SIZE + TCG_STATIC_FRAME_SIZE)
+                   / sizeof(uint64_t)];
+    void *call_slots[TCG_STATIC_CALL_ARGS_SIZE / sizeof(uint64_t)];
 
     regs[TCG_AREG0] = (tcg_target_ulong)env;
-    regs[TCG_REG_CALL_STACK] = sp_value;
+    regs[TCG_REG_CALL_STACK] = (uintptr_t)stack;
+    call_slots[0] = NULL;
     tci_assert(tb_ptr);
 
     for (;;) {
@@ -531,33 +524,53 @@  uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
 
         switch (opc) {
         case INDEX_op_call:
-            tci_args_l(&tb_ptr, &ptr);
+            /*
+             * We are passed a pointer to the TCGHelperInfo, which contains
+             * the function pointer followed by the ffi_cif pointer.
+             */
+            tci_args_nl(&tb_ptr, &len, &ptr);
+
+            /* Helper functions may need to access the "return address" */
             tci_tb_ptr = (uintptr_t)tb_ptr;
-#if TCG_TARGET_REG_BITS == 32
-            tmp64 = ((helper_function)ptr)(tci_read_reg(regs, TCG_REG_R0),
-                                           tci_read_reg(regs, TCG_REG_R1),
-                                           tci_read_reg(regs, TCG_REG_R2),
-                                           tci_read_reg(regs, TCG_REG_R3),
-                                           tci_read_reg(regs, TCG_REG_R4),
-                                           tci_read_reg(regs, TCG_REG_R5),
-                                           tci_read_reg(regs, TCG_REG_R6),
-                                           tci_read_reg(regs, TCG_REG_R7),
-                                           tci_read_reg(regs, TCG_REG_R8),
-                                           tci_read_reg(regs, TCG_REG_R9),
-                                           tci_read_reg(regs, TCG_REG_R10),
-                                           tci_read_reg(regs, TCG_REG_R11));
-            tci_write_reg(regs, TCG_REG_R0, tmp64);
-            tci_write_reg(regs, TCG_REG_R1, tmp64 >> 32);
-#else
-            tmp64 = ((helper_function)ptr)(tci_read_reg(regs, TCG_REG_R0),
-                                           tci_read_reg(regs, TCG_REG_R1),
-                                           tci_read_reg(regs, TCG_REG_R2),
-                                           tci_read_reg(regs, TCG_REG_R3),
-                                           tci_read_reg(regs, TCG_REG_R4),
-                                           tci_read_reg(regs, TCG_REG_R5));
-            tci_write_reg(regs, TCG_REG_R0, tmp64);
-#endif
+
+            /*
+             * Set up the ffi_avalue array once, delayed until now
+             * because many TB's do not make any calls. In tcg_gen_callN,
+             * we arranged for every real argument to be "left-aligned"
+             * in each 64-bit slot.
+             */
+            if (call_slots[0] == NULL) {
+                for (int i = 0; i < ARRAY_SIZE(call_slots); ++i) {
+                    call_slots[i] = &stack[i];
+                }
+            }
+
+            /*
+             * Call the helper function.  Any result winds up
+             * "left-aligned" in the stack[0] slot.
+             */
+            {
+                void **pptr = ptr;
+                ffi_call(pptr[1], pptr[0], stack, call_slots);
+            }
+            switch (len) {
+            case 0: /* void */
+                break;
+            case 1: /* uint32_t */
+                regs[TCG_REG_R0] = *(uint32_t *)stack;
+                break;
+            case 2: /* uint64_t */
+                if (TCG_TARGET_REG_BITS == 32) {
+                    tci_write_reg64(regs, TCG_REG_R1, TCG_REG_R0, stack[0]);
+                } else {
+                    regs[TCG_REG_R0] = stack[0];
+                }
+                break;
+            default:
+                g_assert_not_reached();
+            }
             break;
+
         case INDEX_op_br:
             tci_args_l(&tb_ptr, &ptr);
             tb_ptr = ptr;
@@ -1162,13 +1175,17 @@  int print_insn_tci(bfd_vma addr, disassemble_info *info)
 
     switch (op) {
     case INDEX_op_br:
-    case INDEX_op_call:
     case INDEX_op_exit_tb:
     case INDEX_op_goto_tb:
         tci_args_l(&tb_ptr, &ptr);
         info->fprintf_func(info->stream, "%-12s  %p", op_name, ptr);
         break;
 
+    case INDEX_op_call:
+        tci_args_nl(&tb_ptr, &len, &ptr);
+        info->fprintf_func(info->stream, "%-12s  %d,%p", op_name, len, ptr);
+        break;
+
     case INDEX_op_brcond_i32:
     case INDEX_op_brcond_i64:
         tci_args_rrcl(&tb_ptr, &r0, &r1, &c, &ptr);
diff --git a/tcg/tci/tcg-target.c.inc b/tcg/tci/tcg-target.c.inc
index 7fb3b04eaf..8d75482546 100644
--- a/tcg/tci/tcg-target.c.inc
+++ b/tcg/tci/tcg-target.c.inc
@@ -192,23 +192,8 @@  static const int tcg_target_reg_alloc_order[] = {
 # error Fix needed, number of supported input arguments changed!
 #endif
 
-static const int tcg_target_call_iarg_regs[] = {
-    TCG_REG_R0,
-    TCG_REG_R1,
-    TCG_REG_R2,
-    TCG_REG_R3,
-    TCG_REG_R4,
-    TCG_REG_R5,
-#if TCG_TARGET_REG_BITS == 32
-    /* 32 bit hosts need 2 * MAX_OPC_PARAM_IARGS registers. */
-    TCG_REG_R6,
-    TCG_REG_R7,
-    TCG_REG_R8,
-    TCG_REG_R9,
-    TCG_REG_R10,
-    TCG_REG_R11,
-#endif
-};
+/* No call arguments via registers.  All will be stored on the "stack". */
+static const int tcg_target_call_iarg_regs[] = { };
 
 static const int tcg_target_call_oarg_regs[] = {
     TCG_REG_R0,
@@ -292,8 +277,9 @@  static void tci_out_label(TCGContext *s, TCGLabel *label)
 static void stack_bounds_check(TCGReg base, target_long offset)
 {
     if (base == TCG_REG_CALL_STACK) {
-        tcg_debug_assert(offset < 0);
-        tcg_debug_assert(offset >= -(CPU_TEMP_BUF_NLONGS * sizeof(long)));
+        tcg_debug_assert(offset >= 0);
+        tcg_debug_assert(offset < (TCG_STATIC_CALL_ARGS_SIZE +
+                                   TCG_STATIC_FRAME_SIZE));
     }
 }
 
@@ -360,11 +346,25 @@  static void tcg_out_movi(TCGContext *s, TCGType type,
     old_code_ptr[1] = s->code_ptr - old_code_ptr;
 }
 
-static inline void tcg_out_call(TCGContext *s, const tcg_insn_unit *arg)
+static void tcg_out_call(TCGContext *s, const tcg_insn_unit *arg)
 {
     uint8_t *old_code_ptr = s->code_ptr;
+    const TCGHelperInfo *info;
+    uint8_t which;
+
+    info = g_hash_table_lookup(helper_table, (gpointer)arg);
+    if (info->cif->rtype == &ffi_type_void) {
+        which = 0;
+    } else if (info->cif->rtype->size == 4) {
+        which = 1;
+    } else {
+        tcg_debug_assert(info->cif->rtype->size == 8);
+        which = 2;
+    }
     tcg_out_op_t(s, INDEX_op_call);
-    tcg_out_i(s, (uintptr_t)arg);
+    tcg_out8(s, which);
+    tcg_out_i(s, (uintptr_t)info);
+
     old_code_ptr[1] = s->code_ptr - old_code_ptr;
 }
 
@@ -629,11 +629,9 @@  static void tcg_target_init(TCGContext *s)
     s->reserved_regs = 0;
     tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
 
-    /* We use negative offsets from "sp" so that we can distinguish
-       stores that might pretend to be call arguments.  */
-    tcg_set_frame(s, TCG_REG_CALL_STACK,
-                  -CPU_TEMP_BUF_NLONGS * sizeof(long),
-                  CPU_TEMP_BUF_NLONGS * sizeof(long));
+    /* The call arguments come first, followed by the temp storage. */
+    tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
+                  TCG_STATIC_FRAME_SIZE);
 }
 
 /* Generate global QEMU prologue and epilogue code. */