diff mbox series

arm: Use _dl_find_object on __gnu_Unwind_Find_exidx (BZ 31405)

Message ID 20240222134255.2921498-1-adhemerval.zanella@linaro.org
State Accepted
Commit f4c142bb9fe6b02c0af8cfca8a920091e2dba44b
Headers show
Series arm: Use _dl_find_object on __gnu_Unwind_Find_exidx (BZ 31405) | expand

Commit Message

Adhemerval Zanella Netto Feb. 22, 2024, 1:42 p.m. UTC
Instead of __dl_iterate_phdr. On ARM dlfo_eh_frame/dlfo_eh_count
maps to PT_ARM_EXIDX vaddr start / length.

On a Neoverse N1 machine with 160 cores, the following program:

  $ cat test.c
  #include <stdlib.h>
  #include <pthread.h>
  #include <assert.h>

  enum {
    niter = 1024,
    ntimes = 128,
  };

  static void *
  tf (void *arg)
  {
    int a = (int) arg;

    for (int i = 0; i < niter; i++)
      {
        void *p[ntimes];
        for (int j = 0; j < ntimes; j++)
  	p[j] = malloc (a * 128);
        for (int j = 0; j < ntimes; j++)
  	free (p[j]);
      }

    return NULL;
  }

  int main (int argc, char *argv[])
  {
    enum { nthreads = 16 };
    pthread_t t[nthreads];

    for (int i = 0; i < nthreads; i ++)
      assert (pthread_create (&t[i], NULL, tf, (void *) i) == 0);

    for (int i = 0; i < nthreads; i++)
      {
        void *r;
        assert (pthread_join (t[i], &r) == 0);
        assert (r == NULL);
      }

    return 0;
  }
  $ arm-linux-gnueabihf-gcc -fsanitize=address test.c -o test

Improves from ~15s to 0.5s.

Checked on arm-linux-gnueabihf.
---
 elf/Makefile             |  2 +-
 elf/dl-find_object.c     |  5 ++--
 include/dlfcn.h          |  3 ++-
 sysdeps/arm/find_exidx.c | 57 +++-------------------------------------
 4 files changed, 10 insertions(+), 57 deletions(-)

Comments

Florian Weimer Feb. 22, 2024, 2:54 p.m. UTC | #1
* Adhemerval Zanella:

> On a Neoverse N1 machine with 160 cores, the following program:

>     for (int i = 0; i < niter; i++)
>       {
>         void *p[ntimes];
>         for (int j = 0; j < ntimes; j++)
>   	p[j] = malloc (a * 128);
>         for (int j = 0; j < ntimes; j++)
>   	free (p[j]);
>       }
>
>     return NULL;
>   }

>   $ arm-linux-gnueabihf-gcc -fsanitize=address test.c -o test
>
> Improves from ~15s to 0.5s.

Is this specific to 32-bit Arm, or does _dl_find_object provide a
similar performance benefit to multi-threaded Address Sanitizer on
GCC 12 with glibc 2.35, too?  I didn't expect that.
Adhemerval Zanella Netto Feb. 22, 2024, 4:47 p.m. UTC | #2
On 22/02/24 11:54, Florian Weimer wrote:
> * Adhemerval Zanella:
> 
>> On a Neoverse N1 machine with 160 cores, the following program:
> 
>>     for (int i = 0; i < niter; i++)
>>       {
>>         void *p[ntimes];
>>         for (int j = 0; j < ntimes; j++)
>>   	p[j] = malloc (a * 128);
>>         for (int j = 0; j < ntimes; j++)
>>   	free (p[j]);
>>       }
>>
>>     return NULL;
>>   }
> 
>>   $ arm-linux-gnueabihf-gcc -fsanitize=address test.c -o test
>>
>> Improves from ~15s to 0.5s.
> 
> Is this specific to 32-bit Arm, or does _dl_find_object provide a
> similar performance benefit to multi-threaded Address Sanitizer on
> GCC 12 with glibc 2.35, too?  I didn't expect that.

This is ARM specific afaik, libgcc/unwind-arm-common.inc calls 
__gnu_Unwind_Find_exidx (if available) on _Unwind_Backtrace
call (pretty much like BZ 31405 first comment). And this has
not been changed for a while, so both gcc 12 and master uses the
same code.
Florian Weimer Feb. 22, 2024, 5:25 p.m. UTC | #3
* Adhemerval Zanella Netto:

> On 22/02/24 11:54, Florian Weimer wrote:
>> * Adhemerval Zanella:
>> 
>>> On a Neoverse N1 machine with 160 cores, the following program:
>> 
>>>     for (int i = 0; i < niter; i++)
>>>       {
>>>         void *p[ntimes];
>>>         for (int j = 0; j < ntimes; j++)
>>>   	p[j] = malloc (a * 128);
>>>         for (int j = 0; j < ntimes; j++)
>>>   	free (p[j]);
>>>       }
>>>
>>>     return NULL;
>>>   }
>> 
>>>   $ arm-linux-gnueabihf-gcc -fsanitize=address test.c -o test
>>>
>>> Improves from ~15s to 0.5s.
>> 
>> Is this specific to 32-bit Arm, or does _dl_find_object provide a
>> similar performance benefit to multi-threaded Address Sanitizer on
>> GCC 12 with glibc 2.35, too?  I didn't expect that.
>
> This is ARM specific afaik, libgcc/unwind-arm-common.inc calls 
> __gnu_Unwind_Find_exidx (if available) on _Unwind_Backtrace
> call (pretty much like BZ 31405 first comment). And this has
> not been changed for a while, so both gcc 12 and master uses the
> same code.

If Address Sanitizer calls _Unwind_Backtrace (perhaps via
libbacktrace) in malloc, this should impact DWARF targets as well: the
glibc 2.35/GCC 12 combination is more similar to Arm because the
dl_iterate_phdr call in the _Unwind_Backtrace implementation has been
replaced with _dl_find_object.

The patch is looks okay to me, by the way.  I had this in mind when
working on _dl_find_object, but couldn't test it at the time.
Adhemerval Zanella Netto Feb. 22, 2024, 5:54 p.m. UTC | #4
On 22/02/24 14:25, Florian Weimer wrote:
> * Adhemerval Zanella Netto:
> 
>> On 22/02/24 11:54, Florian Weimer wrote:
>>> * Adhemerval Zanella:
>>>
>>>> On a Neoverse N1 machine with 160 cores, the following program:
>>>
>>>>     for (int i = 0; i < niter; i++)
>>>>       {
>>>>         void *p[ntimes];
>>>>         for (int j = 0; j < ntimes; j++)
>>>>   	p[j] = malloc (a * 128);
>>>>         for (int j = 0; j < ntimes; j++)
>>>>   	free (p[j]);
>>>>       }
>>>>
>>>>     return NULL;
>>>>   }
>>>
>>>>   $ arm-linux-gnueabihf-gcc -fsanitize=address test.c -o test
>>>>
>>>> Improves from ~15s to 0.5s.
>>>
>>> Is this specific to 32-bit Arm, or does _dl_find_object provide a
>>> similar performance benefit to multi-threaded Address Sanitizer on
>>> GCC 12 with glibc 2.35, too?  I didn't expect that.
>>
>> This is ARM specific afaik, libgcc/unwind-arm-common.inc calls 
>> __gnu_Unwind_Find_exidx (if available) on _Unwind_Backtrace
>> call (pretty much like BZ 31405 first comment). And this has
>> not been changed for a while, so both gcc 12 and master uses the
>> same code.
> 
> If Address Sanitizer calls _Unwind_Backtrace (perhaps via
> libbacktrace) in malloc, this should impact DWARF targets as well: the
> glibc 2.35/GCC 12 combination is more similar to Arm because the
> dl_iterate_phdr call in the _Unwind_Backtrace implementation has been
> replaced with _dl_find_object.

The _Unwind_Backtrace depends on then default unwind strategy (Fast/Slow) 
which can be overridden by a environment variable 
(fast_unwind_on_malloc={0,1}).

The fast unwinder strategy (which tries to work the callstack by parsing 
the frame pointer) usually works if you used only one compiler/mode  
building everything (for instance if you build everything with clang -marm); 
but it start to hit some corner cases when you mix compilers and modes:

compiler-rt/lib/sanitizer_common/sanitizer_flags.inc

 43 // ARM thumb/thumb2 frame pointer is inconsistent on GCC and Clang [1]
 44 // and fast-unwider is also unreliable with mixing arm and thumb code [2].
 45 // [1] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92172
 46 // [2] https://bugs.llvm.org/show_bug.cgi?id=44158
 47 COMMON_FLAG(bool, fast_unwind_on_malloc,
 48             !(SANITIZER_LINUX && !SANITIZER_ANDROID && SANITIZER_ARM),
 49             "If available, use the fast frame-pointer-based unwinder on "
 50             "malloc/free.")

> 
> The patch is looks okay to me, by the way.  I had this in mind when
> working on _dl_find_object, but couldn't test it at the time.

Thanks, I will install it.
diff mbox series

Patch

diff --git a/elf/Makefile b/elf/Makefile
index 5d78b659ce..36c04baf02 100644
--- a/elf/Makefile
+++ b/elf/Makefile
@@ -34,6 +34,7 @@  routines = \
   dl-addr \
   dl-addr-obj \
   dl-early_allocate \
+  dl-find_object \
   dl-iteratephdr \
   dl-libc \
   dl-origin \
@@ -60,7 +61,6 @@  dl-routines = \
   dl-deps \
   dl-exception \
   dl-execstack \
-  dl-find_object \
   dl-fini \
   dl-init \
   dl-load \
diff --git a/elf/dl-find_object.c b/elf/dl-find_object.c
index 940fa5c223..449302eda3 100644
--- a/elf/dl-find_object.c
+++ b/elf/dl-find_object.c
@@ -356,7 +356,7 @@  _dlfo_lookup (uintptr_t pc, struct dl_find_object_internal *first1, size_t size)
 }
 
 int
-_dl_find_object (void *pc1, struct dl_find_object *result)
+__dl_find_object (void *pc1, struct dl_find_object *result)
 {
   uintptr_t pc = (uintptr_t) pc1;
 
@@ -463,7 +463,8 @@  _dl_find_object (void *pc1, struct dl_find_object *result)
         return -1;
     } /* Transaction retry loop.  */
 }
-rtld_hidden_def (_dl_find_object)
+hidden_def (__dl_find_object)
+weak_alias (__dl_find_object, _dl_find_object)
 
 /* _dlfo_process_initial is called twice.  First to compute the array
    sizes from the initial loaded mappings.  Second to fill in the
diff --git a/include/dlfcn.h b/include/dlfcn.h
index a44420fa37..f49ee1b0c9 100644
--- a/include/dlfcn.h
+++ b/include/dlfcn.h
@@ -4,7 +4,8 @@ 
 #include <link.h>		/* For ElfW.  */
 #include <stdbool.h>
 
-rtld_hidden_proto (_dl_find_object)
+extern __typeof (_dl_find_object) __dl_find_object;
+hidden_proto (__dl_find_object)
 
 /* Internally used flag.  */
 #define __RTLD_DLOPEN	0x80000000
diff --git a/sysdeps/arm/find_exidx.c b/sysdeps/arm/find_exidx.c
index d647865e5a..a924d59b9f 100644
--- a/sysdeps/arm/find_exidx.c
+++ b/sysdeps/arm/find_exidx.c
@@ -16,64 +16,15 @@ 
    <https://www.gnu.org/licenses/>.  */
 
 #include <link.h>
-#include <unwind.h>
-
-struct unw_eh_callback_data
-{
-  _Unwind_Ptr pc;
-  _Unwind_Ptr exidx_start;
-  int exidx_len;
-};
-
-
-/* Callback to determines if the PC lies within an object, and remember the
-   location of the exception index table if it does.  */
-
-static int
-find_exidx_callback (struct dl_phdr_info * info, size_t size, void * ptr)
-{
-  struct unw_eh_callback_data * data;
-  const ElfW(Phdr) *phdr;
-  int i;
-  int match;
-  _Unwind_Ptr load_base;
-
-  data = (struct unw_eh_callback_data *) ptr;
-  load_base = info->dlpi_addr;
-  phdr = info->dlpi_phdr;
-
-  match = 0;
-  for (i = info->dlpi_phnum; i > 0; i--, phdr++)
-    {
-      if (phdr->p_type == PT_LOAD)
-        {
-          _Unwind_Ptr vaddr = phdr->p_vaddr + load_base;
-          if (data->pc >= vaddr && data->pc < vaddr + phdr->p_memsz)
-            match = 1;
-        }
-      else if (phdr->p_type == PT_ARM_EXIDX)
-	{
-	  data->exidx_start = (_Unwind_Ptr) (phdr->p_vaddr + load_base);
-	  data->exidx_len = phdr->p_memsz;
-	}
-    }
-
-  return match;
-}
-
 
 /* Find the exception index table containing PC.  */
 
 _Unwind_Ptr
 __gnu_Unwind_Find_exidx (_Unwind_Ptr pc, int * pcount)
 {
-  struct unw_eh_callback_data data;
-
-  data.pc = pc;
-  data.exidx_start = 0;
-  if (__dl_iterate_phdr (find_exidx_callback, &data) <= 0)
+  struct dl_find_object data;
+  if (__dl_find_object ((void *) pc, &data) < 0)
     return 0;
-
-  *pcount = data.exidx_len / 8;
-  return data.exidx_start;
+  *pcount = data.dlfo_eh_count;
+  return (_Unwind_Ptr) data.dlfo_eh_frame;
 }