diff mbox series

[v2,1/5] mips: Do not malloc on getdents64 fallback

Message ID 20190731183136.21545-1-adhemerval.zanella@linaro.org
State New
Headers show
Series [v2,1/5] mips: Do not malloc on getdents64 fallback | expand

Commit Message

Adhemerval Zanella Netto July 31, 2019, 6:31 p.m. UTC
This patch changes how the fallback getdents64 implementation calls
non-LFS getdents by replacing the scratch_buffer with static buffer
plus a loop on getdents calls.  This avoids the potential malloc
call on scratch_buffer_set_array_size for large input buffer size
at the cost of more getdents syscalls.

It also adds a small optimization for older kernels, where the first
ENOSYS failure for getdents64 disable subsequent calls.

Check the dirent tests on a mips64-linux-gnu with getdents64 code
disabled.

	* sysdeps/unix/sysv/linux/mips/mips64/getdents64.c (__getdents64):
	Add small optimization for older kernel to avoid issuing
	__NR_getdents64 on each call and replace scratch_buffer usage with
	a static allocated buffer.
---
 .../unix/sysv/linux/mips/mips64/getdents64.c  | 122 ++++++++----------
 1 file changed, 54 insertions(+), 68 deletions(-)

-- 
2.17.1

Comments

Adhemerval Zanella Netto Aug. 28, 2019, 2:09 p.m. UTC | #1
Ping.

On 31/07/2019 15:31, Adhemerval Zanella wrote:
> This patch changes how the fallback getdents64 implementation calls

> non-LFS getdents by replacing the scratch_buffer with static buffer

> plus a loop on getdents calls.  This avoids the potential malloc

> call on scratch_buffer_set_array_size for large input buffer size

> at the cost of more getdents syscalls.

> 

> It also adds a small optimization for older kernels, where the first

> ENOSYS failure for getdents64 disable subsequent calls.

> 

> Check the dirent tests on a mips64-linux-gnu with getdents64 code

> disabled.

> 

> 	* sysdeps/unix/sysv/linux/mips/mips64/getdents64.c (__getdents64):

> 	Add small optimization for older kernel to avoid issuing

> 	__NR_getdents64 on each call and replace scratch_buffer usage with

> 	a static allocated buffer.

> ---

>  .../unix/sysv/linux/mips/mips64/getdents64.c  | 122 ++++++++----------

>  1 file changed, 54 insertions(+), 68 deletions(-)

> 

> diff --git a/sysdeps/unix/sysv/linux/mips/mips64/getdents64.c b/sysdeps/unix/sysv/linux/mips/mips64/getdents64.c

> index 8bf3abb0e0..3b5afd9324 100644

> --- a/sysdeps/unix/sysv/linux/mips/mips64/getdents64.c

> +++ b/sysdeps/unix/sysv/linux/mips/mips64/getdents64.c

> @@ -22,98 +22,84 @@

>  #include <assert.h>

>  #include <sys/param.h>

>  #include <unistd.h>

> -#include <scratch_buffer.h>

>  #include <limits.h>

>  

>  ssize_t

> -__getdents64 (int fd, void *buf0, size_t nbytes)

> +__getdents64 (int fd, void *buf, size_t nbytes)

>  {

> -  char *buf = buf0;

> -

>    /* The system call takes an unsigned int argument, and some length

>       checks in the kernel use an int type.  */

>    if (nbytes > INT_MAX)

>      nbytes = INT_MAX;

>  

>  #ifdef __NR_getdents64

> -  ssize_t ret = INLINE_SYSCALL_CALL (getdents64, fd, buf, nbytes);

> -  if (ret != -1)

> -    return ret;

> +  static bool getdents64_supportted = true;

> +  if (atomic_load_relaxed (&getdents64_supportted))

> +    {

> +      ssize_t ret = INLINE_SYSCALL_CALL (getdents64, fd, buf, nbytes);

> +      if (ret >= 0 || errno != ENOSYS)

> +	return ret;

> +

> +      atomic_store_relaxed (&getdents64_supportted, false);

> +    }

>  #endif

>  

>    /* Unfortunately getdents64 was only wire-up for MIPS n64 on Linux 3.10.

> -     If syscall is not available it need to fallback to non-LFS one.  */

> +     If the syscall is not available it need to fallback to the non-LFS one.

> +     Also to avoid an unbounded allocation through VLA/alloca or malloc (which

> +     would make the syscall non async-signal-safe) it uses a limited buffer.

> +     This is sub-optimal for large NBYTES, however this is a fallback

> +     mechanism to emulate a syscall that kernel should provide.   */

>  

> +  enum { KBUF_SIZE = 1024 };

>    struct kernel_dirent

> -    {

> -      unsigned long d_ino;

> -      unsigned long d_off;

> -      unsigned short int d_reclen;

> -      char d_name[256];

> -    };

> -

> -  const size_t size_diff = (offsetof (struct dirent64, d_name)

> -			   - offsetof (struct kernel_dirent, d_name));

> -

> -  size_t red_nbytes = MIN (nbytes

> -			   - ((nbytes / (offsetof (struct dirent64, d_name)

> -					 + 14)) * size_diff),

> -			   nbytes - size_diff);

> -

> -  struct scratch_buffer tmpbuf;

> -  scratch_buffer_init (&tmpbuf);

> -  if (!scratch_buffer_set_array_size (&tmpbuf, red_nbytes, sizeof (uint8_t)))

> -    INLINE_SYSCALL_ERROR_RETURN_VALUE (ENOMEM);

> -

> -  struct kernel_dirent *skdp, *kdp;

> -  skdp = kdp = tmpbuf.data;

> -

> -  ssize_t retval = INLINE_SYSCALL_CALL (getdents, fd, kdp, red_nbytes);

> -  if (retval == -1)

> -    {

> -      scratch_buffer_free (&tmpbuf);

> -      return -1;

> -    }

> +  {

> +    unsigned long d_ino;

> +    unsigned long d_off;

> +    unsigned short int d_reclen;

> +    char d_name[1];

> +  } kbuf[KBUF_SIZE / sizeof (struct kernel_dirent)];

> +  size_t kbuf_size = nbytes < KBUF_SIZE ? nbytes : KBUF_SIZE;

>  

> -  off64_t last_offset = -1;

>    struct dirent64 *dp = (struct dirent64 *) buf;

> -  while ((char *) kdp < (char *) skdp + retval)

> +

> +  size_t nb = 0;

> +  off64_t last_offset = -1;

> +

> +  ssize_t r;

> +  while ((r = INLINE_SYSCALL_CALL (getdents, fd, kbuf, kbuf_size)) > 0)

>      {

> -      const size_t alignment = _Alignof (struct dirent64);

> -      /* Since kdp->d_reclen is already aligned for the kernel structure

> -	 this may compute a value that is bigger than necessary.  */

> -      size_t new_reclen = ((kdp->d_reclen + size_diff + alignment - 1)

> -			   & ~(alignment - 1));

> -      if ((char *) dp + new_reclen > buf + nbytes)

> -        {

> -	  /* Our heuristic failed.  We read too many entries.  Reset

> -	     the stream.  */

> -	  assert (last_offset != -1);

> -	  __lseek64 (fd, last_offset, SEEK_SET);

> -

> -	  if ((char *) dp == buf)

> +      struct kernel_dirent *skdp, *kdp;

> +      skdp = kdp = kbuf;

> +

> +      while ((char *) kdp < (char *) skdp + r)

> +	{

> +	  const size_t alignment = _Alignof (struct dirent64);

> +	  size_t new_reclen = ((kdp->d_reclen + alignment - 1)

> +			      & ~(alignment - 1));

> +	  if (nb + new_reclen > nbytes)

>  	    {

> -	      scratch_buffer_free (&tmpbuf);

> -	      return INLINE_SYSCALL_ERROR_RETURN_VALUE (EINVAL);

> +		/* The new entry will overflow the input buffer, rewind to

> +		   last obtained entry and return.  */

> +	       __lseek64 (fd, last_offset, SEEK_SET);

> +	       goto out;

>  	    }

> +	  nb += new_reclen;

>  

> -	  break;

> -	}

> -

> -      last_offset = kdp->d_off;

> -      dp->d_ino = kdp->d_ino;

> -      dp->d_off = kdp->d_off;

> -      dp->d_reclen = new_reclen;

> -      dp->d_type = *((char *) kdp + kdp->d_reclen - 1);

> -      memcpy (dp->d_name, kdp->d_name,

> -	      kdp->d_reclen - offsetof (struct kernel_dirent, d_name));

> +	  dp->d_ino = kdp->d_ino;

> +	  dp->d_off = last_offset = kdp->d_off;

> +	  dp->d_reclen = new_reclen;

> +	  dp->d_type = *((char *) kdp + kdp->d_reclen - 1);

> +	  memcpy (dp->d_name, kdp->d_name,

> +		  kdp->d_reclen - offsetof (struct kernel_dirent, d_name));

>  

> -      dp = (struct dirent64 *) ((char *) dp + new_reclen);

> -      kdp = (struct kernel_dirent *) (((char *) kdp) + kdp->d_reclen);

> +	  dp = (struct dirent64 *) ((char *) dp + new_reclen);

> +	  kdp = (struct kernel_dirent *) (((char *) kdp) + kdp->d_reclen);

> +	}

>      }

>  

> -  scratch_buffer_free (&tmpbuf);

> -  return (char *) dp - buf;

> +out:

> +  return (char *) dp - (char *) buf;

>  }

>  libc_hidden_def (__getdents64)

>  weak_alias (__getdents64, getdents64)

>
Andreas Schwab Aug. 28, 2019, 2:34 p.m. UTC | #2
On Jul 31 2019, Adhemerval Zanella <adhemerval.zanella@linaro.org> wrote:

>  #ifdef __NR_getdents64

> -  ssize_t ret = INLINE_SYSCALL_CALL (getdents64, fd, buf, nbytes);

> -  if (ret != -1)

> -    return ret;

> +  static bool getdents64_supportted = true;


s/supportted/supported/

Andreas.

-- 
Andreas Schwab, SUSE Labs, schwab@suse.de
GPG Key fingerprint = 0196 BAD8 1CE9 1970 F4BE  1748 E4D4 88E3 0EEA B9D7
"And now for something completely different."
Florian Weimer Aug. 28, 2019, 2:42 p.m. UTC | #3
* Adhemerval Zanella:

>    struct kernel_dirent

> +  {

> +    unsigned long d_ino;

> +    unsigned long d_off;

> +    unsigned short int d_reclen;

> +    char d_name[1];

> +  } kbuf[KBUF_SIZE / sizeof (struct kernel_dirent)];


I think it's still not clear to me in which cases we actually need to
move the dirent entries in the buffer.  My impression is that we just
need to move d_name by one byte because before, d_type was after the
name, and afterwards, it comes before the name.  But the record
boundaries are unchanged.

Thanks,
Florian
Adhemerval Zanella Netto Aug. 28, 2019, 5:01 p.m. UTC | #4
On 28/08/2019 11:34, Andreas Schwab wrote:
> On Jul 31 2019, Adhemerval Zanella <adhemerval.zanella@linaro.org> wrote:

> 

>>  #ifdef __NR_getdents64

>> -  ssize_t ret = INLINE_SYSCALL_CALL (getdents64, fd, buf, nbytes);

>> -  if (ret != -1)

>> -    return ret;

>> +  static bool getdents64_supportted = true;

> 

> s/supportted/supported/

> 

> Andreas.

> 


Fixed, thanks.
Adhemerval Zanella Netto Aug. 28, 2019, 9:02 p.m. UTC | #5
On 28/08/2019 11:42, Florian Weimer wrote:
> * Adhemerval Zanella:

> 

>>    struct kernel_dirent

>> +  {

>> +    unsigned long d_ino;

>> +    unsigned long d_off;

>> +    unsigned short int d_reclen;

>> +    char d_name[1];

>> +  } kbuf[KBUF_SIZE / sizeof (struct kernel_dirent)];

> 

> I think it's still not clear to me in which cases we actually need to

> move the dirent entries in the buffer.  My impression is that we just

> need to move d_name by one byte because before, d_type was after the

> name, and afterwards, it comes before the name.  But the record

> boundaries are unchanged.


My understanding is the record boundary would be same as long the 
d_name fits on the alignment padding space minus the size of the
d_type.  Otherwise the dirent64 will need to be extended.

This leads to possible memmoves the whole buffer obtained from kernel
on which iteration (which for large buffer is another performance
drain) and possible lseek for the case where registers are slide out
the buffer.  I strongly think using an auxiliary buffer is still
simpler than operating in-place.

Also, mips64-n32 has another issue I found recently: it uses the
compat syscall which uses the compat dirent with both d_ino/d_off
as 32 bits.  For in-place getdents call it will require even more
memory moves. Below is an updated patch to fix it:

---

diff --git a/sysdeps/unix/sysv/linux/mips/mips64/getdents64.c b/sysdeps/unix/sysv/linux/mips/mips64/getdents64.c
index 8bf3abb0e0..881b5eb651 100644
--- a/sysdeps/unix/sysv/linux/mips/mips64/getdents64.c
+++ b/sysdeps/unix/sysv/linux/mips/mips64/getdents64.c
@@ -22,98 +22,92 @@
 #include <assert.h>
 #include <sys/param.h>
 #include <unistd.h>
-#include <scratch_buffer.h>
 #include <limits.h>
 
 ssize_t
-__getdents64 (int fd, void *buf0, size_t nbytes)
+__getdents64 (int fd, void *buf, size_t nbytes)
 {
-  char *buf = buf0;
-
   /* The system call takes an unsigned int argument, and some length
      checks in the kernel use an int type.  */
   if (nbytes > INT_MAX)
     nbytes = INT_MAX;
 
 #ifdef __NR_getdents64
-  ssize_t ret = INLINE_SYSCALL_CALL (getdents64, fd, buf, nbytes);
-  if (ret != -1)
-    return ret;
+  static bool getdents64_supported = true;
+  if (atomic_load_relaxed (&getdents64_supported))
+    {
+      ssize_t ret = INLINE_SYSCALL_CALL (getdents64, fd, buf, nbytes);
+      if (ret >= 0 || errno != ENOSYS)
+	return ret;
+
+      atomic_store_relaxed (&getdents64_supported, false);
+    }
 #endif
 
   /* Unfortunately getdents64 was only wire-up for MIPS n64 on Linux 3.10.
-     If syscall is not available it need to fallback to non-LFS one.  */
+     If the syscall is not available it need to fallback to the non-LFS one.
+     Also to avoid an unbounded allocation through VLA/alloca or malloc (which
+     would make the syscall non async-signal-safe) it uses a limited buffer.
+     This is sub-optimal for large NBYTES, however this is a fallback
+     mechanism to emulate a syscall that kernel should provide.   */
 
+  enum { KBUF_SIZE = 1024 };
   struct kernel_dirent
-    {
-      unsigned long d_ino;
-      unsigned long d_off;
-      unsigned short int d_reclen;
-      char d_name[256];
-    };
+  {
+#if _MIPS_SIM == _ABI64
+    uint64_t d_ino;
+    uint64_t d_off;
+#else
+    uint32_t d_ino;
+    uint32_t d_off;
+#endif
+    unsigned short int d_reclen;
+    char d_name[1];
+  } kbuf[KBUF_SIZE / sizeof (struct kernel_dirent)];
+  size_t kbuf_size = nbytes < KBUF_SIZE ? nbytes : KBUF_SIZE;
 
   const size_t size_diff = (offsetof (struct dirent64, d_name)
-			   - offsetof (struct kernel_dirent, d_name));
-
-  size_t red_nbytes = MIN (nbytes
-			   - ((nbytes / (offsetof (struct dirent64, d_name)
-					 + 14)) * size_diff),
-			   nbytes - size_diff);
-
-  struct scratch_buffer tmpbuf;
-  scratch_buffer_init (&tmpbuf);
-  if (!scratch_buffer_set_array_size (&tmpbuf, red_nbytes, sizeof (uint8_t)))
-    INLINE_SYSCALL_ERROR_RETURN_VALUE (ENOMEM);
-
-  struct kernel_dirent *skdp, *kdp;
-  skdp = kdp = tmpbuf.data;
+                           - offsetof (struct kernel_dirent, d_name));
 
-  ssize_t retval = INLINE_SYSCALL_CALL (getdents, fd, kdp, red_nbytes);
-  if (retval == -1)
-    {
-      scratch_buffer_free (&tmpbuf);
-      return -1;
-    }
+  struct dirent64 *dp = (struct dirent64 *) buf;
 
+  size_t nb = 0;
   off64_t last_offset = -1;
-  struct dirent64 *dp = (struct dirent64 *) buf;
-  while ((char *) kdp < (char *) skdp + retval)
+
+  ssize_t r;
+  while ((r = INLINE_SYSCALL_CALL (getdents, fd, kbuf, kbuf_size)) > 0)
     {
-      const size_t alignment = _Alignof (struct dirent64);
-      /* Since kdp->d_reclen is already aligned for the kernel structure
-	 this may compute a value that is bigger than necessary.  */
-      size_t new_reclen = ((kdp->d_reclen + size_diff + alignment - 1)
-			   & ~(alignment - 1));
-      if ((char *) dp + new_reclen > buf + nbytes)
-        {
-	  /* Our heuristic failed.  We read too many entries.  Reset
-	     the stream.  */
-	  assert (last_offset != -1);
-	  __lseek64 (fd, last_offset, SEEK_SET);
-
-	  if ((char *) dp == buf)
+      struct kernel_dirent *skdp, *kdp;
+      skdp = kdp = kbuf;
+
+      while ((char *) kdp < (char *) skdp + r)
+	{
+	  const size_t alignment = _Alignof (struct dirent64);
+	  size_t new_reclen = ((kdp->d_reclen + size_diff + alignment - 1)
+			      & ~(alignment - 1));
+	  if (nb + new_reclen > nbytes)
 	    {
-	      scratch_buffer_free (&tmpbuf);
-	      return INLINE_SYSCALL_ERROR_RETURN_VALUE (EINVAL);
+		/* The new entry will overflow the input buffer, rewind to
+		   last obtained entry and return.  */
+	       __lseek64 (fd, last_offset, SEEK_SET);
+	       goto out;
 	    }
+	  nb += new_reclen;
 
-	  break;
-	}
-
-      last_offset = kdp->d_off;
-      dp->d_ino = kdp->d_ino;
-      dp->d_off = kdp->d_off;
-      dp->d_reclen = new_reclen;
-      dp->d_type = *((char *) kdp + kdp->d_reclen - 1);
-      memcpy (dp->d_name, kdp->d_name,
-	      kdp->d_reclen - offsetof (struct kernel_dirent, d_name));
+	  dp->d_ino = kdp->d_ino;
+	  dp->d_off = last_offset = kdp->d_off;
+	  dp->d_reclen = new_reclen;
+	  dp->d_type = *((char *) kdp + kdp->d_reclen - 1);
+	  memcpy (dp->d_name, kdp->d_name,
+		  kdp->d_reclen - offsetof (struct kernel_dirent, d_name));
 
-      dp = (struct dirent64 *) ((char *) dp + new_reclen);
-      kdp = (struct kernel_dirent *) (((char *) kdp) + kdp->d_reclen);
+	  dp = (struct dirent64 *) ((char *) dp + new_reclen);
+	  kdp = (struct kernel_dirent *) (((char *) kdp) + kdp->d_reclen);
+	}
     }
 
-  scratch_buffer_free (&tmpbuf);
-  return (char *) dp - buf;
+out:
+  return (char *) dp - (char *) buf;
 }
 libc_hidden_def (__getdents64)
 weak_alias (__getdents64, getdents64)
Florian Weimer Aug. 28, 2019, 9:23 p.m. UTC | #6
* Adhemerval Zanella:

> On 28/08/2019 11:42, Florian Weimer wrote:

>> * Adhemerval Zanella:

>> 

>>>    struct kernel_dirent

>>> +  {

>>> +    unsigned long d_ino;

>>> +    unsigned long d_off;

>>> +    unsigned short int d_reclen;

>>> +    char d_name[1];

>>> +  } kbuf[KBUF_SIZE / sizeof (struct kernel_dirent)];

>> 

>> I think it's still not clear to me in which cases we actually need to

>> move the dirent entries in the buffer.  My impression is that we just

>> need to move d_name by one byte because before, d_type was after the

>> name, and afterwards, it comes before the name.  But the record

>> boundaries are unchanged.

>

> My understanding is the record boundary would be same as long the 

> d_name fits on the alignment padding space minus the size of the

> d_type.  Otherwise the dirent64 will need to be extended.


Hmm.  The problem is mips64 n32, right?  Where unsigned long is 32 bits?

Thanks,
Florian
Adhemerval Zanella Netto Aug. 29, 2019, 11:04 a.m. UTC | #7
On 28/08/2019 18:23, Florian Weimer wrote:
> * Adhemerval Zanella:

> 

>> On 28/08/2019 11:42, Florian Weimer wrote:

>>> * Adhemerval Zanella:

>>>

>>>>    struct kernel_dirent

>>>> +  {

>>>> +    unsigned long d_ino;

>>>> +    unsigned long d_off;

>>>> +    unsigned short int d_reclen;

>>>> +    char d_name[1];

>>>> +  } kbuf[KBUF_SIZE / sizeof (struct kernel_dirent)];

>>>

>>> I think it's still not clear to me in which cases we actually need to

>>> move the dirent entries in the buffer.  My impression is that we just

>>> need to move d_name by one byte because before, d_type was after the

>>> name, and afterwards, it comes before the name.  But the record

>>> boundaries are unchanged.

>>

>> My understanding is the record boundary would be same as long the 

>> d_name fits on the alignment padding space minus the size of the

>> d_type.  Otherwise the dirent64 will need to be extended.

> 

> Hmm.  The problem is mips64 n32, right?  Where unsigned long is 32 bits?


Yes, it would require either a mips64-n32 specific path or implementation.
Florian Weimer Aug. 30, 2019, 9:52 a.m. UTC | #8
Sorry, I missed that despite all the 64s in the patch, this code is also
used on 32-bit architectures.  The review below should take this new (to
me) piece of information into account.

* Adhemerval Zanella:

> +  static bool getdents64_supportted = true;

> +  if (atomic_load_relaxed (&getdents64_supportted))


Our atomics do not support bool, only 4 bytes and 8 bytes (if
__HAVE_64B_ATOMICS) is defined.  See __atomic_check_size.

Probably it will work if it compiles, but I haven't checked that.

>    /* Unfortunately getdents64 was only wire-up for MIPS n64 on Linux 3.10.

> +     If the syscall is not available it need to fallback to the non-LFS one.

> +     Also to avoid an unbounded allocation through VLA/alloca or malloc (which

> +     would make the syscall non async-signal-safe) it uses a limited buffer.

> +     This is sub-optimal for large NBYTES, however this is a fallback

> +     mechanism to emulate a syscall that kernel should provide.   */

>  

> +  enum { KBUF_SIZE = 1024 };


The choice of size needs a comment.  I think the largest possible
practical length of the d_name member are 255 Unicode characters in the
BMP, in UTF-8 encoding, so d_name is 766 bytes long, plus 10 bytes from
the header, for 776 bytes total.  (NAME_MAX is not a constant on Linux
in reality.)

>    struct kernel_dirent

> +  {

> +    unsigned long d_ino;

> +    unsigned long d_off;

> +    unsigned short int d_reclen;

> +    char d_name[1];

> +  } kbuf[KBUF_SIZE / sizeof (struct kernel_dirent)];

> +  size_t kbuf_size = nbytes < KBUF_SIZE ? nbytes : KBUF_SIZE;


I would define kbuf as a char array, and perhaps leave out the d_name
member in struct kernel_dirent.  You can copy out the struct
kernel_dirent using memcpy, which GCC should optimize away.

Ideally, we would perform the conversion in-line, with a forward scan to
make the d_reclen members point backwards, followed by a backwards scan
to move everything in place.  This would reduce stack usage quite
significantly and avoid a hard restriction on d_name length.

>    struct dirent64 *dp = (struct dirent64 *) buf;

> +

> +  size_t nb = 0;

> +  off64_t last_offset = -1;

> +

> +  ssize_t r;

> +  while ((r = INLINE_SYSCALL_CALL (getdents, fd, kbuf, kbuf_size)) > 0)

>      {


Sorry, I don't see how the outer loop is exited.  I think we should
remove it because it does not seem necessary.

> +      struct kernel_dirent *skdp, *kdp;

> +      skdp = kdp = kbuf;

> +

> +      while ((char *) kdp < (char *) skdp + r)

> +	{

> +	  const size_t alignment = _Alignof (struct dirent64);

> +	  size_t new_reclen = ((kdp->d_reclen + alignment - 1)

> +			      & ~(alignment - 1));


I think this is the roundup macro.  If you use that, I think you don't
need the alignment variable.

Is the length really correct, though?  I'd expect it to grow by the
additional size of the d_ino and d_off members.  I think it would be
best recompute it from scratch, using the actual length of d_name.

> +	  if (nb + new_reclen > nbytes)

>  	    {

> +		/* The new entry will overflow the input buffer, rewind to

> +		   last obtained entry and return.  */

> +	       __lseek64 (fd, last_offset, SEEK_SET);


I don't think last_offset is guaranteed to have been set with a proper
offset at this point.  Given that d_name is essentially of unbounded
length, even expanding the first entry can cause failure.

Maybe it's possible to avoid this corner case by limiting the amount of
data being read so that we know that the application-supplied buffer is
always large enough for any possible expansion.  I think the worse-case
growth is for lengths 5 to 8, from 20 bytes to 32 bytes.  So perhaps we
should divide the buffer size by 1.6 and use that?

> +	       goto out;

>  	    }

> +	  nb += new_reclen;

>  

> +	  dp->d_ino = kdp->d_ino;

> +	  dp->d_off = last_offset = kdp->d_off;

> +	  dp->d_reclen = new_reclen;

> +	  dp->d_type = *((char *) kdp + kdp->d_reclen - 1);


I think instead of reading through kdp, you should use char *s and
memcpy, to avoid the aliasing violation, as discussed above.  Likewise
for writing to dp.

> +	  memcpy (dp->d_name, kdp->d_name,

> +		  kdp->d_reclen - offsetof (struct kernel_dirent, d_name));


See above, I have concerns about the length.

Thanks,
Florian
Adhemerval Zanella Netto Aug. 30, 2019, 12:52 p.m. UTC | #9
On 30/08/2019 06:52, Florian Weimer wrote:
> Sorry, I missed that despite all the 64s in the patch, this code is also

> used on 32-bit architectures.  The review below should take this new (to

> me) piece of information into account.

> 

> * Adhemerval Zanella:

> 

>> +  static bool getdents64_supportted = true;

>> +  if (atomic_load_relaxed (&getdents64_supportted))

> 

> Our atomics do not support bool, only 4 bytes and 8 bytes (if

> __HAVE_64B_ATOMICS) is defined.  See __atomic_check_size.

> 

> Probably it will work if it compiles, but I haven't checked that.


MIPS in fact does not define USE_ATOMIC_COMPILER_BUILTINS and thus it uses
the __atomic_check_size_ls macro.  In any case, I changed to a int for this 
case.

> 

>>    /* Unfortunately getdents64 was only wire-up for MIPS n64 on Linux 3.10.

>> +     If the syscall is not available it need to fallback to the non-LFS one.

>> +     Also to avoid an unbounded allocation through VLA/alloca or malloc (which

>> +     would make the syscall non async-signal-safe) it uses a limited buffer.

>> +     This is sub-optimal for large NBYTES, however this is a fallback

>> +     mechanism to emulate a syscall that kernel should provide.   */

>>  

>> +  enum { KBUF_SIZE = 1024 };

> 

> The choice of size needs a comment.  I think the largest possible

> practical length of the d_name member are 255 Unicode characters in the

> BMP, in UTF-8 encoding, so d_name is 766 bytes long, plus 10 bytes from

> the header, for 776 bytes total.  (NAME_MAX is not a constant on Linux

> in reality.)


I picked the buffer as an arbitrary value, what about:

  /* The largest possible practical length of the d_name member are 255
     Unicode characters in UTF-8 encoding, so d_name is 766 bytes long, plus
     18 (mips64) / 10 (mips64n32) bytes from header, for total of 784 (mips64)
     / 776 (mips64n32) bytes total.  Ensure that the minimum size hold at
     least one entry.  */
  enum { KBUF_SIZE = 1024 };


> 

>>    struct kernel_dirent

>> +  {

>> +    unsigned long d_ino;

>> +    unsigned long d_off;

>> +    unsigned short int d_reclen;

>> +    char d_name[1];

>> +  } kbuf[KBUF_SIZE / sizeof (struct kernel_dirent)];

>> +  size_t kbuf_size = nbytes < KBUF_SIZE ? nbytes : KBUF_SIZE;

> 

> I would define kbuf as a char array, and perhaps leave out the d_name

> member in struct kernel_dirent.  You can copy out the struct

> kernel_dirent using memcpy, which GCC should optimize away.


I defined the buffer as 'struct kernel_dirent' to make it easier to align
for the required fields.  It allows simplify the access on the loop to
avoid memcpy calls.

> 

> Ideally, we would perform the conversion in-line, with a forward scan to

> make the d_reclen members point backwards, followed by a backwards scan

> to move everything in place.  This would reduce stack usage quite

> significantly and avoid a hard restriction on d_name length.


I take this fallback code as a best effort due the restrictions we have
(make it async-signal-safe with bounded stack allocation).  Even with a
clever buffer managements we can't remove the d_name length with the
aforementioned restrictions.  The proper solution is indeed using the
syscall.

> 

>>    struct dirent64 *dp = (struct dirent64 *) buf;

>> +

>> +  size_t nb = 0;

>> +  off64_t last_offset = -1;

>> +

>> +  ssize_t r;

>> +  while ((r = INLINE_SYSCALL_CALL (getdents, fd, kbuf, kbuf_size)) > 0)

>>      {

> 

> Sorry, I don't see how the outer loop is exited.  I think we should

> remove it because it does not seem necessary.


We still need to handle the cases where NBYTES are larger than the temporary
buffer, because it might require multiple getdents calls.


> 

>> +      struct kernel_dirent *skdp, *kdp;

>> +      skdp = kdp = kbuf;

>> +

>> +      while ((char *) kdp < (char *) skdp + r)

>> +	{

>> +	  const size_t alignment = _Alignof (struct dirent64);

>> +	  size_t new_reclen = ((kdp->d_reclen + alignment - 1)

>> +			      & ~(alignment - 1));

> 

> I think this is the roundup macro.  If you use that, I think you don't

> need the alignment variable.


I changed to use ALIGN_UP from libc-pointer-arith.h.

> 

> Is the length really correct, though?  I'd expect it to grow by the

> additional size of the d_ino and d_off members.  I think it would be

> best recompute it from scratch, using the actual length of d_name.


It it because you are referencing to an older patch version, checking on
mips64-n32 I adjusted to:

  const size_t size_diff = (offsetof (struct dirent64, d_name)
                           - offsetof (struct kernel_dirent, d_name));
  [...]
               size_t new_reclen = ALIGN_UP (kdp->d_reclen + size_diff,
                                        _Alignof (struct dirent64));
  [...]

> 

>> +	  if (nb + new_reclen > nbytes)

>>  	    {

>> +		/* The new entry will overflow the input buffer, rewind to

>> +		   last obtained entry and return.  */

>> +	       __lseek64 (fd, last_offset, SEEK_SET);

> 

> I don't think last_offset is guaranteed to have been set with a proper

> offset at this point.  Given that d_name is essentially of unbounded

> length, even expanding the first entry can cause failure.

> 

> Maybe it's possible to avoid this corner case by limiting the amount of

> data being read so that we know that the application-supplied buffer is

> always large enough for any possible expansion.  I think the worse-case

> growth is for lengths 5 to 8, from 20 bytes to 32 bytes.  So perhaps we

> should divide the buffer size by 1.6 and use that?


For this case I really think we just need to return an error to user:

            if (nb + new_reclen > nbytes)
            {   
		/* Entry is too large for the static buffer.  */
		if (last_offset == -1)
		  {
		    __set_errno (EINVAL);
		    return -1;
		  }
                /* The new entry will overflow the input buffer, rewind to
                   last obtained entry and return.  */
               __lseek64 (fd, last_offset, SEEK_SET);
               goto out;
            }

Again I see this fallback code as a best-effort since we are emulating the
syscall with additional restraints.  Most of time glibc tries to play smart
emulating a syscall ended in a lot of headaches...


> 

>> +	       goto out;

>>  	    }

>> +	  nb += new_reclen;

>>  

>> +	  dp->d_ino = kdp->d_ino;

>> +	  dp->d_off = last_offset = kdp->d_off;

>> +	  dp->d_reclen = new_reclen;

>> +	  dp->d_type = *((char *) kdp + kdp->d_reclen - 1);

> 

> I think instead of reading through kdp, you should use char *s and

> memcpy, to avoid the aliasing violation, as discussed above.  Likewise

> for writing to dp.


I think if we proper setting the buffer alignment there is no need to do it.
Also the problem of using memcpy here is for mips64n32 the size is *not* 
equal for dp and kdp, each would require an extra step as:

   {
     typeof (kdp->d_ino) kino;
     memcpy (&kino, &kdp_d->ino, sizeof (kino));
     typeof (dp->d_ino) dino = kino;
     memcpy (&dp->d_ino, &kino, sizeof (dino));
   }

> 

>> +	  memcpy (dp->d_name, kdp->d_name,

>> +		  kdp->d_reclen - offsetof (struct kernel_dirent, d_name));

> 

> See above, I have concerns about the length.

> 

> Thanks,

> Florian

>
Florian Weimer Sept. 2, 2019, 12:59 p.m. UTC | #10
* Adhemerval Zanella:

>> The choice of size needs a comment.  I think the largest possible

>> practical length of the d_name member are 255 Unicode characters in the

>> BMP, in UTF-8 encoding, so d_name is 766 bytes long, plus 10 bytes from

>> the header, for 776 bytes total.  (NAME_MAX is not a constant on Linux

>> in reality.)

>

> I picked the buffer as an arbitrary value, what about:

>

>   /* The largest possible practical length of the d_name member are 255

>      Unicode characters in UTF-8 encoding, so d_name is 766 bytes long, plus

>      18 (mips64) / 10 (mips64n32) bytes from header, for total of 784 (mips64)

>      / 776 (mips64n32) bytes total.  Ensure that the minimum size hold at

>      least one entry.  */

>   enum { KBUF_SIZE = 1024 };


“holds”

Looks good.

>>>    struct kernel_dirent

>>> +  {

>>> +    unsigned long d_ino;

>>> +    unsigned long d_off;

>>> +    unsigned short int d_reclen;

>>> +    char d_name[1];

>>> +  } kbuf[KBUF_SIZE / sizeof (struct kernel_dirent)];

>>> +  size_t kbuf_size = nbytes < KBUF_SIZE ? nbytes : KBUF_SIZE;

>> 

>> I would define kbuf as a char array, and perhaps leave out the d_name

>> member in struct kernel_dirent.  You can copy out the struct

>> kernel_dirent using memcpy, which GCC should optimize away.

>

> I defined the buffer as 'struct kernel_dirent' to make it easier to align

> for the required fields.  It allows simplify the access on the loop to

> avoid memcpy calls.


But the code is invalid C as a result of this.  We do not compile glibc
with -fno-strict-aliasing, after all.

>>>    struct dirent64 *dp = (struct dirent64 *) buf;

>>> +

>>> +  size_t nb = 0;

>>> +  off64_t last_offset = -1;

>>> +

>>> +  ssize_t r;

>>> +  while ((r = INLINE_SYSCALL_CALL (getdents, fd, kbuf, kbuf_size)) > 0)

>>>      {

>> 

>> Sorry, I don't see how the outer loop is exited.  I think we should

>> remove it because it does not seem necessary.

>

> We still need to handle the cases where NBYTES are larger than the temporary

> buffer, because it might require multiple getdents calls.


Why?  The application (or readdir) will call us again to get more entries.

>> Is the length really correct, though?  I'd expect it to grow by the

>> additional size of the d_ino and d_off members.  I think it would be

>> best recompute it from scratch, using the actual length of d_name.

>

> It it because you are referencing to an older patch version, checking on

> mips64-n32 I adjusted to:

>

>   const size_t size_diff = (offsetof (struct dirent64, d_name)

>                            - offsetof (struct kernel_dirent, d_name));

>   [...]

>                size_t new_reclen = ALIGN_UP (kdp->d_reclen + size_diff,

>                                         _Alignof (struct dirent64));

>   [...]


Okay, this needs a comment that this is a conservative approximation
(some of size_diff might fit into the existing padding for alignment).

>>> +	  if (nb + new_reclen > nbytes)

>>>  	    {

>>> +		/* The new entry will overflow the input buffer, rewind to

>>> +		   last obtained entry and return.  */

>>> +	       __lseek64 (fd, last_offset, SEEK_SET);

>> 

>> I don't think last_offset is guaranteed to have been set with a proper

>> offset at this point.  Given that d_name is essentially of unbounded

>> length, even expanding the first entry can cause failure.

>> 

>> Maybe it's possible to avoid this corner case by limiting the amount of

>> data being read so that we know that the application-supplied buffer is

>> always large enough for any possible expansion.  I think the worse-case

>> growth is for lengths 5 to 8, from 20 bytes to 32 bytes.  So perhaps we

>> should divide the buffer size by 1.6 and use that?

>

> For this case I really think we just need to return an error to user:

>

>             if (nb + new_reclen > nbytes)

>             {   

> 		/* Entry is too large for the static buffer.  */


Fixed-size buffer, it's not static. 8-)

> 		if (last_offset == -1)

> 		  {

> 		    __set_errno (EINVAL);

> 		    return -1;

> 		  }

>                 /* The new entry will overflow the input buffer, rewind to

>                    last obtained entry and return.  */

>                __lseek64 (fd, last_offset, SEEK_SET);

>                goto out;

>             }

>

> Again I see this fallback code as a best-effort since we are emulating the

> syscall with additional restraints.  Most of time glibc tries to play smart

> emulating a syscall ended in a lot of headaches...


I don't disagree.

Which error code does the kernel return if no entry can be read at all?
We should mirror that.

>>> +	       goto out;

>>>  	    }

>>> +	  nb += new_reclen;

>>>  

>>> +	  dp->d_ino = kdp->d_ino;

>>> +	  dp->d_off = last_offset = kdp->d_off;

>>> +	  dp->d_reclen = new_reclen;

>>> +	  dp->d_type = *((char *) kdp + kdp->d_reclen - 1);

>> 

>> I think instead of reading through kdp, you should use char *s and

>> memcpy, to avoid the aliasing violation, as discussed above.  Likewise

>> for writing to dp.

>

> I think if we proper setting the buffer alignment there is no need to do it.

> Also the problem of using memcpy here is for mips64n32 the size is *not* 

> equal for dp and kdp, each would require an extra step as:

>

>    {

>      typeof (kdp->d_ino) kino;

>      memcpy (&kino, &kdp_d->ino, sizeof (kino));

>      typeof (dp->d_ino) dino = kino;

>      memcpy (&dp->d_ino, &kino, sizeof (dino));

>    }


I think that's just the price of writing correct C.  It's also what the
kernel does.

I don't even think there's a requirement that the byte buffer passed to
getdents64 has any kind of alignment.

Thanks,
Florian
Adhemerval Zanella Netto Sept. 2, 2019, 5:38 p.m. UTC | #11
On 02/09/2019 09:59, Florian Weimer wrote:
> * Adhemerval Zanella:

> 

>>> The choice of size needs a comment.  I think the largest possible

>>> practical length of the d_name member are 255 Unicode characters in the

>>> BMP, in UTF-8 encoding, so d_name is 766 bytes long, plus 10 bytes from

>>> the header, for 776 bytes total.  (NAME_MAX is not a constant on Linux

>>> in reality.)

>>

>> I picked the buffer as an arbitrary value, what about:

>>

>>   /* The largest possible practical length of the d_name member are 255

>>      Unicode characters in UTF-8 encoding, so d_name is 766 bytes long, plus

>>      18 (mips64) / 10 (mips64n32) bytes from header, for total of 784 (mips64)

>>      / 776 (mips64n32) bytes total.  Ensure that the minimum size hold at

>>      least one entry.  */

>>   enum { KBUF_SIZE = 1024 };

> 

> “holds”

> 

> Looks good.

> 

>>>>    struct kernel_dirent

>>>> +  {

>>>> +    unsigned long d_ino;

>>>> +    unsigned long d_off;

>>>> +    unsigned short int d_reclen;

>>>> +    char d_name[1];

>>>> +  } kbuf[KBUF_SIZE / sizeof (struct kernel_dirent)];

>>>> +  size_t kbuf_size = nbytes < KBUF_SIZE ? nbytes : KBUF_SIZE;

>>>

>>> I would define kbuf as a char array, and perhaps leave out the d_name

>>> member in struct kernel_dirent.  You can copy out the struct

>>> kernel_dirent using memcpy, which GCC should optimize away.

>>

>> I defined the buffer as 'struct kernel_dirent' to make it easier to align

>> for the required fields.  It allows simplify the access on the loop to

>> avoid memcpy calls.

> 

> But the code is invalid C as a result of this.  We do not compile glibc

> with -fno-strict-aliasing, after all.


Right, we indeed do some pointer arithmetic to get the kdp value. I change
to use char buffer plus memcpy to obtain the fields.

> 

>>>>    struct dirent64 *dp = (struct dirent64 *) buf;

>>>> +

>>>> +  size_t nb = 0;

>>>> +  off64_t last_offset = -1;

>>>> +

>>>> +  ssize_t r;

>>>> +  while ((r = INLINE_SYSCALL_CALL (getdents, fd, kbuf, kbuf_size)) > 0)

>>>>      {

>>>

>>> Sorry, I don't see how the outer loop is exited.  I think we should

>>> remove it because it does not seem necessary.

>>

>> We still need to handle the cases where NBYTES are larger than the temporary

>> buffer, because it might require multiple getdents calls.

> 

> Why?  The application (or readdir) will call us again to get more entries.


As a simple optimization to avoid it, but I think we can move to a simplified
version.

> 

>>> Is the length really correct, though?  I'd expect it to grow by the

>>> additional size of the d_ino and d_off members.  I think it would be

>>> best recompute it from scratch, using the actual length of d_name.

>>

>> It it because you are referencing to an older patch version, checking on

>> mips64-n32 I adjusted to:

>>

>>   const size_t size_diff = (offsetof (struct dirent64, d_name)

>>                            - offsetof (struct kernel_dirent, d_name));

>>   [...]

>>                size_t new_reclen = ALIGN_UP (kdp->d_reclen + size_diff,

>>                                         _Alignof (struct dirent64));

>>   [...]

> 

> Okay, this needs a comment that this is a conservative approximation

> (some of size_diff might fit into the existing padding for alignment).


Ack.

> 

>>>> +	  if (nb + new_reclen > nbytes)

>>>>  	    {

>>>> +		/* The new entry will overflow the input buffer, rewind to

>>>> +		   last obtained entry and return.  */

>>>> +	       __lseek64 (fd, last_offset, SEEK_SET);

>>>

>>> I don't think last_offset is guaranteed to have been set with a proper

>>> offset at this point.  Given that d_name is essentially of unbounded

>>> length, even expanding the first entry can cause failure.

>>>

>>> Maybe it's possible to avoid this corner case by limiting the amount of

>>> data being read so that we know that the application-supplied buffer is

>>> always large enough for any possible expansion.  I think the worse-case

>>> growth is for lengths 5 to 8, from 20 bytes to 32 bytes.  So perhaps we

>>> should divide the buffer size by 1.6 and use that?

>>

>> For this case I really think we just need to return an error to user:

>>

>>             if (nb + new_reclen > nbytes)

>>             {   

>> 		/* Entry is too large for the static buffer.  */

> 

> Fixed-size buffer, it's not static. 8-)


Ack.

> 

>> 		if (last_offset == -1)

>> 		  {

>> 		    __set_errno (EINVAL);

>> 		    return -1;

>> 		  }

>>                 /* The new entry will overflow the input buffer, rewind to

>>                    last obtained entry and return.  */

>>                __lseek64 (fd, last_offset, SEEK_SET);

>>                goto out;

>>             }

>>

>> Again I see this fallback code as a best-effort since we are emulating the

>> syscall with additional restraints.  Most of time glibc tries to play smart

>> emulating a syscall ended in a lot of headaches...

> 

> I don't disagree.

> 

> Which error code does the kernel return if no entry can be read at all?

> We should mirror that.


It returns -1/EINVAL.

fs/readdir.c
177         if (reclen > buf->count)                                                                    
178                 return -EINVAL;

> 

>>>> +	       goto out;

>>>>  	    }

>>>> +	  nb += new_reclen;

>>>>  

>>>> +	  dp->d_ino = kdp->d_ino;

>>>> +	  dp->d_off = last_offset = kdp->d_off;

>>>> +	  dp->d_reclen = new_reclen;

>>>> +	  dp->d_type = *((char *) kdp + kdp->d_reclen - 1);

>>>

>>> I think instead of reading through kdp, you should use char *s and

>>> memcpy, to avoid the aliasing violation, as discussed above.  Likewise

>>> for writing to dp.

>>

>> I think if we proper setting the buffer alignment there is no need to do it.

>> Also the problem of using memcpy here is for mips64n32 the size is *not* 

>> equal for dp and kdp, each would require an extra step as:

>>

>>    {

>>      typeof (kdp->d_ino) kino;

>>      memcpy (&kino, &kdp_d->ino, sizeof (kino));

>>      typeof (dp->d_ino) dino = kino;

>>      memcpy (&dp->d_ino, &kino, sizeof (dino));

>>    }

> 

> I think that's just the price of writing correct C.  It's also what the

> kernel does.


Ack.

> 

> I don't even think there's a requirement that the byte buffer passed to

> getdents64 has any kind of alignment.

> 

> Thanks,

> Florian

> 


Updated patch below.

--

This patch changes how the fallback getdents64 implementation calls
non-LFS getdents by replacing the scratch_buffer with static buffer
plus a loop on getdents calls.  This avoids the potential malloc
call on scratch_buffer_set_array_size for large input buffer size
at the cost of more getdents syscalls.

It also adds a small optimization for older kernels, where the first
ENOSYS failure for getdents64 disable subsequent calls.

Check the dirent tests on a mips64-linux-gnu with getdents64 code
disabled.

	* sysdeps/unix/sysv/linux/mips/mips64/getdents64.c (__getdents64):
	Add small optimization for older kernel to avoid issuing
	__NR_getdents64 on each call and replace scratch_buffer usage with
	a static allocated buffer.
---
 .../unix/sysv/linux/mips/mips64/getdents64.c  | 133 ++++++++++--------
 1 file changed, 76 insertions(+), 57 deletions(-)

diff --git a/sysdeps/unix/sysv/linux/mips/mips64/getdents64.c b/sysdeps/unix/sysv/linux/mips/mips64/getdents64.c
index 8bf3abb0e0..02e15a0b2e 100644
--- a/sysdeps/unix/sysv/linux/mips/mips64/getdents64.c
+++ b/sysdeps/unix/sysv/linux/mips/mips64/getdents64.c
@@ -22,88 +22,108 @@
 #include <assert.h>
 #include <sys/param.h>
 #include <unistd.h>
-#include <scratch_buffer.h>
 #include <limits.h>
 
+#include <include/libc-pointer-arith.h>
+
 ssize_t
-__getdents64 (int fd, void *buf0, size_t nbytes)
+__getdents64 (int fd, void *buf, size_t nbytes)
 {
-  char *buf = buf0;
-
   /* The system call takes an unsigned int argument, and some length
      checks in the kernel use an int type.  */
   if (nbytes > INT_MAX)
     nbytes = INT_MAX;
 
 #ifdef __NR_getdents64
-  ssize_t ret = INLINE_SYSCALL_CALL (getdents64, fd, buf, nbytes);
-  if (ret != -1)
-    return ret;
+  static int getdents64_supported = true;
+  if (atomic_load_relaxed (&getdents64_supported))
+    {
+      ssize_t ret = INLINE_SYSCALL_CALL (getdents64, fd, buf, nbytes);
+      if (ret >= 0 || errno != ENOSYS)
+	return ret;
+
+      atomic_store_relaxed (&getdents64_supported, false);
+    }
 #endif
 
   /* Unfortunately getdents64 was only wire-up for MIPS n64 on Linux 3.10.
-     If syscall is not available it need to fallback to non-LFS one.  */
+     If the syscall is not available it need to fallback to the non-LFS one.
+     Also to avoid an unbounded allocation through VLA/alloca or malloc (which
+     would make the syscall non async-signal-safe) it uses a limited buffer.
+     This is sub-optimal for large NBYTES, however this is a fallback
+     mechanism to emulate a syscall that kernel should provide.   */
 
   struct kernel_dirent
-    {
-      unsigned long d_ino;
-      unsigned long d_off;
-      unsigned short int d_reclen;
-      char d_name[256];
-    };
+  {
+#if _MIPS_SIM == _ABI64
+    uint64_t d_ino;
+    uint64_t d_off;
+#else
+    uint32_t d_ino;
+    uint32_t d_off;
+#endif
+    unsigned short int d_reclen;
+    char d_name[1];
+  };
+
+  /* The largest possible practical length of the d_name member are 255
+     Unicode characters in UTF-8 encoding, so d_name is 766 bytes long, plus
+     18 (mips64) / 10 (mips64n32) bytes from header, for total of 784 (mips64)
+     / 776 (mips64n32) bytes total.  Ensure that the minimum size holds at
+     least one entry.  */
+  enum { KBUF_SIZE = 1024 };
+  char kbuf[KBUF_SIZE];
+  size_t kbuf_size = nbytes < KBUF_SIZE ? nbytes : KBUF_SIZE;
 
   const size_t size_diff = (offsetof (struct dirent64, d_name)
-			   - offsetof (struct kernel_dirent, d_name));
+                           - offsetof (struct kernel_dirent, d_name));
 
-  size_t red_nbytes = MIN (nbytes
-			   - ((nbytes / (offsetof (struct dirent64, d_name)
-					 + 14)) * size_diff),
-			   nbytes - size_diff);
+  struct dirent64 *dp = (struct dirent64 *) buf;
 
-  struct scratch_buffer tmpbuf;
-  scratch_buffer_init (&tmpbuf);
-  if (!scratch_buffer_set_array_size (&tmpbuf, red_nbytes, sizeof (uint8_t)))
-    INLINE_SYSCALL_ERROR_RETURN_VALUE (ENOMEM);
+  size_t nb = 0;
+  off64_t last_offset = -1;
 
-  struct kernel_dirent *skdp, *kdp;
-  skdp = kdp = tmpbuf.data;
+  ssize_t r = INLINE_SYSCALL_CALL (getdents, fd, kbuf, kbuf_size);
+  if (r <= 0)
+    return r;
 
-  ssize_t retval = INLINE_SYSCALL_CALL (getdents, fd, kdp, red_nbytes);
-  if (retval == -1)
-    {
-      scratch_buffer_free (&tmpbuf);
-      return -1;
-    }
+  struct kernel_dirent *skdp, *kdp;
+  skdp = kdp = (struct kernel_dirent *) kbuf;
 
-  off64_t last_offset = -1;
-  struct dirent64 *dp = (struct dirent64 *) buf;
-  while ((char *) kdp < (char *) skdp + retval)
+  while ((char *) kdp < (char *) skdp + r)
     {
-      const size_t alignment = _Alignof (struct dirent64);
-      /* Since kdp->d_reclen is already aligned for the kernel structure
-	 this may compute a value that is bigger than necessary.  */
-      size_t new_reclen = ((kdp->d_reclen + size_diff + alignment - 1)
-			   & ~(alignment - 1));
-      if ((char *) dp + new_reclen > buf + nbytes)
-        {
-	  /* Our heuristic failed.  We read too many entries.  Reset
-	     the stream.  */
-	  assert (last_offset != -1);
-	  __lseek64 (fd, last_offset, SEEK_SET);
-
-	  if ((char *) dp == buf)
+      /* This is a conservative approximation, since some of size_diff might
+	 fit into the existing padding for alignment.  */
+      size_t new_reclen = ALIGN_UP (kdp->d_reclen + size_diff,
+				    _Alignof (struct dirent64));
+      if (nb + new_reclen > nbytes)
+	{
+	  /* Entry is too large for the fixed-size buffer.  */
+	  if (last_offset == -1)
 	    {
-	      scratch_buffer_free (&tmpbuf);
-	      return INLINE_SYSCALL_ERROR_RETURN_VALUE (EINVAL);
+	      __set_errno (EINVAL);
+	      return -1;
 	    }
 
-	  break;
+	  /* The new entry will overflow the input buffer, rewind to last
+	     obtained entry and return.  */
+	  __lseek64 (fd, last_offset, SEEK_SET);
+	  return (char *) dp - (char *) buf;
 	}
-
-      last_offset = kdp->d_off;
-      dp->d_ino = kdp->d_ino;
-      dp->d_off = kdp->d_off;
-      dp->d_reclen = new_reclen;
+      nb += new_reclen;
+
+#define copy_field(dst, src)			\
+  ({						\
+     typeof (src) _src;				\
+     memcpy (&_src, &(src), sizeof (src));	\
+     typeof (dst) _dst = _src;			\
+     memcpy (&(dst), &_dst, sizeof (dst));	\
+  })
+
+      copy_field (dp->d_ino, kdp->d_ino);
+      copy_field (dp->d_off, kdp->d_off);
+      copy_field (last_offset, kdp->d_off);
+      copy_field (dp->d_reclen, new_reclen);
       dp->d_type = *((char *) kdp + kdp->d_reclen - 1);
       memcpy (dp->d_name, kdp->d_name,
 	      kdp->d_reclen - offsetof (struct kernel_dirent, d_name));
@@ -112,8 +132,7 @@ __getdents64 (int fd, void *buf0, size_t nbytes)
       kdp = (struct kernel_dirent *) (((char *) kdp) + kdp->d_reclen);
     }
 
-  scratch_buffer_free (&tmpbuf);
-  return (char *) dp - buf;
+  return (char *) dp - (char *) buf;
 }
 libc_hidden_def (__getdents64)
 weak_alias (__getdents64, getdents64)
Adhemerval Zanella Netto Oct. 7, 2019, 5:49 p.m. UTC | #12
Ping.

On 02/09/2019 14:38, Adhemerval Zanella wrote:
> 

> 

> On 02/09/2019 09:59, Florian Weimer wrote:

>> * Adhemerval Zanella:

>>

>>>> The choice of size needs a comment.  I think the largest possible

>>>> practical length of the d_name member are 255 Unicode characters in the

>>>> BMP, in UTF-8 encoding, so d_name is 766 bytes long, plus 10 bytes from

>>>> the header, for 776 bytes total.  (NAME_MAX is not a constant on Linux

>>>> in reality.)

>>>

>>> I picked the buffer as an arbitrary value, what about:

>>>

>>>   /* The largest possible practical length of the d_name member are 255

>>>      Unicode characters in UTF-8 encoding, so d_name is 766 bytes long, plus

>>>      18 (mips64) / 10 (mips64n32) bytes from header, for total of 784 (mips64)

>>>      / 776 (mips64n32) bytes total.  Ensure that the minimum size hold at

>>>      least one entry.  */

>>>   enum { KBUF_SIZE = 1024 };

>>

>> “holds”

>>

>> Looks good.

>>

>>>>>    struct kernel_dirent

>>>>> +  {

>>>>> +    unsigned long d_ino;

>>>>> +    unsigned long d_off;

>>>>> +    unsigned short int d_reclen;

>>>>> +    char d_name[1];

>>>>> +  } kbuf[KBUF_SIZE / sizeof (struct kernel_dirent)];

>>>>> +  size_t kbuf_size = nbytes < KBUF_SIZE ? nbytes : KBUF_SIZE;

>>>>

>>>> I would define kbuf as a char array, and perhaps leave out the d_name

>>>> member in struct kernel_dirent.  You can copy out the struct

>>>> kernel_dirent using memcpy, which GCC should optimize away.

>>>

>>> I defined the buffer as 'struct kernel_dirent' to make it easier to align

>>> for the required fields.  It allows simplify the access on the loop to

>>> avoid memcpy calls.

>>

>> But the code is invalid C as a result of this.  We do not compile glibc

>> with -fno-strict-aliasing, after all.

> 

> Right, we indeed do some pointer arithmetic to get the kdp value. I change

> to use char buffer plus memcpy to obtain the fields.

> 

>>

>>>>>    struct dirent64 *dp = (struct dirent64 *) buf;

>>>>> +

>>>>> +  size_t nb = 0;

>>>>> +  off64_t last_offset = -1;

>>>>> +

>>>>> +  ssize_t r;

>>>>> +  while ((r = INLINE_SYSCALL_CALL (getdents, fd, kbuf, kbuf_size)) > 0)

>>>>>      {

>>>>

>>>> Sorry, I don't see how the outer loop is exited.  I think we should

>>>> remove it because it does not seem necessary.

>>>

>>> We still need to handle the cases where NBYTES are larger than the temporary

>>> buffer, because it might require multiple getdents calls.

>>

>> Why?  The application (or readdir) will call us again to get more entries.

> 

> As a simple optimization to avoid it, but I think we can move to a simplified

> version.

> 

>>

>>>> Is the length really correct, though?  I'd expect it to grow by the

>>>> additional size of the d_ino and d_off members.  I think it would be

>>>> best recompute it from scratch, using the actual length of d_name.

>>>

>>> It it because you are referencing to an older patch version, checking on

>>> mips64-n32 I adjusted to:

>>>

>>>   const size_t size_diff = (offsetof (struct dirent64, d_name)

>>>                            - offsetof (struct kernel_dirent, d_name));

>>>   [...]

>>>                size_t new_reclen = ALIGN_UP (kdp->d_reclen + size_diff,

>>>                                         _Alignof (struct dirent64));

>>>   [...]

>>

>> Okay, this needs a comment that this is a conservative approximation

>> (some of size_diff might fit into the existing padding for alignment).

> 

> Ack.

> 

>>

>>>>> +	  if (nb + new_reclen > nbytes)

>>>>>  	    {

>>>>> +		/* The new entry will overflow the input buffer, rewind to

>>>>> +		   last obtained entry and return.  */

>>>>> +	       __lseek64 (fd, last_offset, SEEK_SET);

>>>>

>>>> I don't think last_offset is guaranteed to have been set with a proper

>>>> offset at this point.  Given that d_name is essentially of unbounded

>>>> length, even expanding the first entry can cause failure.

>>>>

>>>> Maybe it's possible to avoid this corner case by limiting the amount of

>>>> data being read so that we know that the application-supplied buffer is

>>>> always large enough for any possible expansion.  I think the worse-case

>>>> growth is for lengths 5 to 8, from 20 bytes to 32 bytes.  So perhaps we

>>>> should divide the buffer size by 1.6 and use that?

>>>

>>> For this case I really think we just need to return an error to user:

>>>

>>>             if (nb + new_reclen > nbytes)

>>>             {   

>>> 		/* Entry is too large for the static buffer.  */

>>

>> Fixed-size buffer, it's not static. 8-)

> 

> Ack.

> 

>>

>>> 		if (last_offset == -1)

>>> 		  {

>>> 		    __set_errno (EINVAL);

>>> 		    return -1;

>>> 		  }

>>>                 /* The new entry will overflow the input buffer, rewind to

>>>                    last obtained entry and return.  */

>>>                __lseek64 (fd, last_offset, SEEK_SET);

>>>                goto out;

>>>             }

>>>

>>> Again I see this fallback code as a best-effort since we are emulating the

>>> syscall with additional restraints.  Most of time glibc tries to play smart

>>> emulating a syscall ended in a lot of headaches...

>>

>> I don't disagree.

>>

>> Which error code does the kernel return if no entry can be read at all?

>> We should mirror that.

> 

> It returns -1/EINVAL.

> 

> fs/readdir.c

> 177         if (reclen > buf->count)                                                                    

> 178                 return -EINVAL;

> 

>>

>>>>> +	       goto out;

>>>>>  	    }

>>>>> +	  nb += new_reclen;

>>>>>  

>>>>> +	  dp->d_ino = kdp->d_ino;

>>>>> +	  dp->d_off = last_offset = kdp->d_off;

>>>>> +	  dp->d_reclen = new_reclen;

>>>>> +	  dp->d_type = *((char *) kdp + kdp->d_reclen - 1);

>>>>

>>>> I think instead of reading through kdp, you should use char *s and

>>>> memcpy, to avoid the aliasing violation, as discussed above.  Likewise

>>>> for writing to dp.

>>>

>>> I think if we proper setting the buffer alignment there is no need to do it.

>>> Also the problem of using memcpy here is for mips64n32 the size is *not* 

>>> equal for dp and kdp, each would require an extra step as:

>>>

>>>    {

>>>      typeof (kdp->d_ino) kino;

>>>      memcpy (&kino, &kdp_d->ino, sizeof (kino));

>>>      typeof (dp->d_ino) dino = kino;

>>>      memcpy (&dp->d_ino, &kino, sizeof (dino));

>>>    }

>>

>> I think that's just the price of writing correct C.  It's also what the

>> kernel does.

> 

> Ack.

> 

>>

>> I don't even think there's a requirement that the byte buffer passed to

>> getdents64 has any kind of alignment.

>>

>> Thanks,

>> Florian

>>

> 

> Updated patch below.

> 

> --

> 

> This patch changes how the fallback getdents64 implementation calls

> non-LFS getdents by replacing the scratch_buffer with static buffer

> plus a loop on getdents calls.  This avoids the potential malloc

> call on scratch_buffer_set_array_size for large input buffer size

> at the cost of more getdents syscalls.

> 

> It also adds a small optimization for older kernels, where the first

> ENOSYS failure for getdents64 disable subsequent calls.

> 

> Check the dirent tests on a mips64-linux-gnu with getdents64 code

> disabled.

> 

> 	* sysdeps/unix/sysv/linux/mips/mips64/getdents64.c (__getdents64):

> 	Add small optimization for older kernel to avoid issuing

> 	__NR_getdents64 on each call and replace scratch_buffer usage with

> 	a static allocated buffer.

> ---

>  .../unix/sysv/linux/mips/mips64/getdents64.c  | 133 ++++++++++--------

>  1 file changed, 76 insertions(+), 57 deletions(-)

> 

> diff --git a/sysdeps/unix/sysv/linux/mips/mips64/getdents64.c b/sysdeps/unix/sysv/linux/mips/mips64/getdents64.c

> index 8bf3abb0e0..02e15a0b2e 100644

> --- a/sysdeps/unix/sysv/linux/mips/mips64/getdents64.c

> +++ b/sysdeps/unix/sysv/linux/mips/mips64/getdents64.c

> @@ -22,88 +22,108 @@

>  #include <assert.h>

>  #include <sys/param.h>

>  #include <unistd.h>

> -#include <scratch_buffer.h>

>  #include <limits.h>

>  

> +#include <include/libc-pointer-arith.h>

> +

>  ssize_t

> -__getdents64 (int fd, void *buf0, size_t nbytes)

> +__getdents64 (int fd, void *buf, size_t nbytes)

>  {

> -  char *buf = buf0;

> -

>    /* The system call takes an unsigned int argument, and some length

>       checks in the kernel use an int type.  */

>    if (nbytes > INT_MAX)

>      nbytes = INT_MAX;

>  

>  #ifdef __NR_getdents64

> -  ssize_t ret = INLINE_SYSCALL_CALL (getdents64, fd, buf, nbytes);

> -  if (ret != -1)

> -    return ret;

> +  static int getdents64_supported = true;

> +  if (atomic_load_relaxed (&getdents64_supported))

> +    {

> +      ssize_t ret = INLINE_SYSCALL_CALL (getdents64, fd, buf, nbytes);

> +      if (ret >= 0 || errno != ENOSYS)

> +	return ret;

> +

> +      atomic_store_relaxed (&getdents64_supported, false);

> +    }

>  #endif

>  

>    /* Unfortunately getdents64 was only wire-up for MIPS n64 on Linux 3.10.

> -     If syscall is not available it need to fallback to non-LFS one.  */

> +     If the syscall is not available it need to fallback to the non-LFS one.

> +     Also to avoid an unbounded allocation through VLA/alloca or malloc (which

> +     would make the syscall non async-signal-safe) it uses a limited buffer.

> +     This is sub-optimal for large NBYTES, however this is a fallback

> +     mechanism to emulate a syscall that kernel should provide.   */

>  

>    struct kernel_dirent

> -    {

> -      unsigned long d_ino;

> -      unsigned long d_off;

> -      unsigned short int d_reclen;

> -      char d_name[256];

> -    };

> +  {

> +#if _MIPS_SIM == _ABI64

> +    uint64_t d_ino;

> +    uint64_t d_off;

> +#else

> +    uint32_t d_ino;

> +    uint32_t d_off;

> +#endif

> +    unsigned short int d_reclen;

> +    char d_name[1];

> +  };

> +

> +  /* The largest possible practical length of the d_name member are 255

> +     Unicode characters in UTF-8 encoding, so d_name is 766 bytes long, plus

> +     18 (mips64) / 10 (mips64n32) bytes from header, for total of 784 (mips64)

> +     / 776 (mips64n32) bytes total.  Ensure that the minimum size holds at

> +     least one entry.  */

> +  enum { KBUF_SIZE = 1024 };

> +  char kbuf[KBUF_SIZE];

> +  size_t kbuf_size = nbytes < KBUF_SIZE ? nbytes : KBUF_SIZE;

>  

>    const size_t size_diff = (offsetof (struct dirent64, d_name)

> -			   - offsetof (struct kernel_dirent, d_name));

> +                           - offsetof (struct kernel_dirent, d_name));

>  

> -  size_t red_nbytes = MIN (nbytes

> -			   - ((nbytes / (offsetof (struct dirent64, d_name)

> -					 + 14)) * size_diff),

> -			   nbytes - size_diff);

> +  struct dirent64 *dp = (struct dirent64 *) buf;

>  

> -  struct scratch_buffer tmpbuf;

> -  scratch_buffer_init (&tmpbuf);

> -  if (!scratch_buffer_set_array_size (&tmpbuf, red_nbytes, sizeof (uint8_t)))

> -    INLINE_SYSCALL_ERROR_RETURN_VALUE (ENOMEM);

> +  size_t nb = 0;

> +  off64_t last_offset = -1;

>  

> -  struct kernel_dirent *skdp, *kdp;

> -  skdp = kdp = tmpbuf.data;

> +  ssize_t r = INLINE_SYSCALL_CALL (getdents, fd, kbuf, kbuf_size);

> +  if (r <= 0)

> +    return r;

>  

> -  ssize_t retval = INLINE_SYSCALL_CALL (getdents, fd, kdp, red_nbytes);

> -  if (retval == -1)

> -    {

> -      scratch_buffer_free (&tmpbuf);

> -      return -1;

> -    }

> +  struct kernel_dirent *skdp, *kdp;

> +  skdp = kdp = (struct kernel_dirent *) kbuf;

>  

> -  off64_t last_offset = -1;

> -  struct dirent64 *dp = (struct dirent64 *) buf;

> -  while ((char *) kdp < (char *) skdp + retval)

> +  while ((char *) kdp < (char *) skdp + r)

>      {

> -      const size_t alignment = _Alignof (struct dirent64);

> -      /* Since kdp->d_reclen is already aligned for the kernel structure

> -	 this may compute a value that is bigger than necessary.  */

> -      size_t new_reclen = ((kdp->d_reclen + size_diff + alignment - 1)

> -			   & ~(alignment - 1));

> -      if ((char *) dp + new_reclen > buf + nbytes)

> -        {

> -	  /* Our heuristic failed.  We read too many entries.  Reset

> -	     the stream.  */

> -	  assert (last_offset != -1);

> -	  __lseek64 (fd, last_offset, SEEK_SET);

> -

> -	  if ((char *) dp == buf)

> +      /* This is a conservative approximation, since some of size_diff might

> +	 fit into the existing padding for alignment.  */

> +      size_t new_reclen = ALIGN_UP (kdp->d_reclen + size_diff,

> +				    _Alignof (struct dirent64));

> +      if (nb + new_reclen > nbytes)

> +	{

> +	  /* Entry is too large for the fixed-size buffer.  */

> +	  if (last_offset == -1)

>  	    {

> -	      scratch_buffer_free (&tmpbuf);

> -	      return INLINE_SYSCALL_ERROR_RETURN_VALUE (EINVAL);

> +	      __set_errno (EINVAL);

> +	      return -1;

>  	    }

>  

> -	  break;

> +	  /* The new entry will overflow the input buffer, rewind to last

> +	     obtained entry and return.  */

> +	  __lseek64 (fd, last_offset, SEEK_SET);

> +	  return (char *) dp - (char *) buf;

>  	}

> -

> -      last_offset = kdp->d_off;

> -      dp->d_ino = kdp->d_ino;

> -      dp->d_off = kdp->d_off;

> -      dp->d_reclen = new_reclen;

> +      nb += new_reclen;

> +

> +#define copy_field(dst, src)			\

> +  ({						\

> +     typeof (src) _src;				\

> +     memcpy (&_src, &(src), sizeof (src));	\

> +     typeof (dst) _dst = _src;			\

> +     memcpy (&(dst), &_dst, sizeof (dst));	\

> +  })

> +

> +      copy_field (dp->d_ino, kdp->d_ino);

> +      copy_field (dp->d_off, kdp->d_off);

> +      copy_field (last_offset, kdp->d_off);

> +      copy_field (dp->d_reclen, new_reclen);

>        dp->d_type = *((char *) kdp + kdp->d_reclen - 1);

>        memcpy (dp->d_name, kdp->d_name,

>  	      kdp->d_reclen - offsetof (struct kernel_dirent, d_name));

> @@ -112,8 +132,7 @@ __getdents64 (int fd, void *buf0, size_t nbytes)

>        kdp = (struct kernel_dirent *) (((char *) kdp) + kdp->d_reclen);

>      }

>  

> -  scratch_buffer_free (&tmpbuf);

> -  return (char *) dp - buf;

> +  return (char *) dp - (char *) buf;

>  }

>  libc_hidden_def (__getdents64)

>  weak_alias (__getdents64, getdents64)

>
Florian Weimer Oct. 7, 2019, 6:29 p.m. UTC | #13
* Adhemerval Zanella:

> +#define copy_field(dst, src)			\

> +  ({						\

> +     typeof (src) _src;				\

> +     memcpy (&_src, &(src), sizeof (src));	\

> +     typeof (dst) _dst = _src;			\

> +     memcpy (&(dst), &_dst, sizeof (dst));	\

> +  })

> +

> +      copy_field (dp->d_ino, kdp->d_ino);

> +      copy_field (dp->d_off, kdp->d_off);

> +      copy_field (last_offset, kdp->d_off);

> +      copy_field (dp->d_reclen, new_reclen);

>        dp->d_type = *((char *) kdp + kdp->d_reclen - 1);


I believe this still asserts the dynamic type of *dp, which is not what
we want.  The truly portable way probably involves using offsetof and
not -> dereferencing. 8-(

Considering that, I would probably drop copy_field, compile the file
with -fno-strict-aliasing, and add a comment to the (now plain
assignments) that this is okay due to -fno-strict-aliasing.

But this is really up to you, I do not want to discuss this patch to
death.

Thanks,
Florian
Adhemerval Zanella Netto Oct. 8, 2019, 5:37 p.m. UTC | #14
On 07/10/2019 15:29, Florian Weimer wrote:
> * Adhemerval Zanella:

> 

>> +#define copy_field(dst, src)			\

>> +  ({						\

>> +     typeof (src) _src;				\

>> +     memcpy (&_src, &(src), sizeof (src));	\

>> +     typeof (dst) _dst = _src;			\

>> +     memcpy (&(dst), &_dst, sizeof (dst));	\

>> +  })

>> +

>> +      copy_field (dp->d_ino, kdp->d_ino);

>> +      copy_field (dp->d_off, kdp->d_off);

>> +      copy_field (last_offset, kdp->d_off);

>> +      copy_field (dp->d_reclen, new_reclen);

>>        dp->d_type = *((char *) kdp + kdp->d_reclen - 1);

> 

> I believe this still asserts the dynamic type of *dp, which is not what

> we want.  The truly portable way probably involves using offsetof and

> not -> dereferencing. 8-(

> 

> Considering that, I would probably drop copy_field, compile the file

> with -fno-strict-aliasing, and add a comment to the (now plain

> assignments) that this is okay due to -fno-strict-aliasing.

> 

> But this is really up to you, I do not want to discuss this patch to

> death.

> 


I chatted with Rich Felker yesterday and we couldn't be sure that
'&((T*)buf)->member' is indeed UB.  In any case I changed to a portable
way now and I think it should be ok to push.

--

diff --git a/sysdeps/unix/sysv/linux/mips/mips64/getdents64.c b/sysdeps/unix/sysv/linux/mips/mips64/getdents64.c
index a8c65cccbf..905239cad9 100644
--- a/sysdeps/unix/sysv/linux/mips/mips64/getdents64.c
+++ b/sysdeps/unix/sysv/linux/mips/mips64/getdents64.c
@@ -22,88 +22,113 @@
 #include <assert.h>
 #include <sys/param.h>
 #include <unistd.h>
-#include <scratch_buffer.h>
 #include <limits.h>
 
+#include <include/libc-pointer-arith.h>
+
 ssize_t
-__getdents64 (int fd, void *buf0, size_t nbytes)
+__getdents64 (int fd, void *buf, size_t nbytes)
 {
-  char *buf = buf0;
-
   /* The system call takes an unsigned int argument, and some length
      checks in the kernel use an int type.  */
   if (nbytes > INT_MAX)
     nbytes = INT_MAX;
 
 #ifdef __NR_getdents64
-  ssize_t ret = INLINE_SYSCALL_CALL (getdents64, fd, buf, nbytes);
-  if (ret != -1)
-    return ret;
+  static int getdents64_supported = true;
+  if (atomic_load_relaxed (&getdents64_supported))
+    {
+      ssize_t ret = INLINE_SYSCALL_CALL (getdents64, fd, buf, nbytes);
+      if (ret >= 0 || errno != ENOSYS)
+	return ret;
+
+      atomic_store_relaxed (&getdents64_supported, false);
+    }
 #endif
 
   /* Unfortunately getdents64 was only wire-up for MIPS n64 on Linux 3.10.
-     If syscall is not available it need to fallback to non-LFS one.  */
+     If the syscall is not available it need to fallback to the non-LFS one.
+     Also to avoid an unbounded allocation through VLA/alloca or malloc (which
+     would make the syscall non async-signal-safe) it uses a limited buffer.
+     This is sub-optimal for large NBYTES, however this is a fallback
+     mechanism to emulate a syscall that kernel should provide.   */
 
   struct kernel_dirent
-    {
-      unsigned long d_ino;
-      unsigned long d_off;
-      unsigned short int d_reclen;
-      char d_name[256];
-    };
+  {
+#if _MIPS_SIM == _ABI64
+    uint64_t d_ino;
+    uint64_t d_off;
+#else
+    uint32_t d_ino;
+    uint32_t d_off;
+#endif
+    unsigned short int d_reclen;
+    char d_name[1];
+  };
+
+  /* The largest possible practical length of the d_name member are 255
+     Unicode characters in UTF-8 encoding, so d_name is 766 bytes long, plus
+     18 (mips64) / 10 (mips64n32) bytes from header, for total of 784 (mips64)
+     / 776 (mips64n32) bytes total.  Ensure that the minimum size holds at
+     least one entry.  */
+  enum { KBUF_SIZE = 1024 };
+  char kbuf[KBUF_SIZE];
+  size_t kbuf_size = nbytes < KBUF_SIZE ? nbytes : KBUF_SIZE;
 
   const size_t size_diff = (offsetof (struct dirent64, d_name)
-			   - offsetof (struct kernel_dirent, d_name));
+                           - offsetof (struct kernel_dirent, d_name));
 
-  size_t red_nbytes = MIN (nbytes
-			   - ((nbytes / (offsetof (struct dirent64, d_name)
-					 + 14)) * size_diff),
-			   nbytes - size_diff);
+  struct dirent64 *dp = (struct dirent64 *) buf;
 
-  struct scratch_buffer tmpbuf;
-  scratch_buffer_init (&tmpbuf);
-  if (!scratch_buffer_set_array_size (&tmpbuf, red_nbytes, sizeof (uint8_t)))
-    INLINE_SYSCALL_ERROR_RETURN_VALUE (ENOMEM);
+  size_t nb = 0;
+  off64_t last_offset = -1;
 
-  struct kernel_dirent *skdp, *kdp;
-  skdp = kdp = tmpbuf.data;
+  ssize_t r = INLINE_SYSCALL_CALL (getdents, fd, kbuf, kbuf_size);
+  if (r <= 0)
+    return r;
 
-  ssize_t retval = INLINE_SYSCALL_CALL (getdents, fd, kdp, red_nbytes);
-  if (retval == -1)
-    {
-      scratch_buffer_free (&tmpbuf);
-      return -1;
-    }
+  struct kernel_dirent *skdp, *kdp;
+  skdp = kdp = (struct kernel_dirent *) kbuf;
 
-  off64_t last_offset = -1;
-  struct dirent64 *dp = (struct dirent64 *) buf;
-  while ((char *) kdp < (char *) skdp + retval)
+  while ((char *) kdp < (char *) skdp + r)
     {
-      const size_t alignment = _Alignof (struct dirent64);
-      /* Since kdp->d_reclen is already aligned for the kernel structure
-	 this may compute a value that is bigger than necessary.  */
-      size_t new_reclen = ((kdp->d_reclen + size_diff + alignment - 1)
-			   & ~(alignment - 1));
-      if ((char *) dp + new_reclen > buf + nbytes)
-        {
-	  /* Our heuristic failed.  We read too many entries.  Reset
-	     the stream.  */
-	  assert (last_offset != -1);
-	  __lseek64 (fd, last_offset, SEEK_SET);
-
-	  if ((char *) dp == buf)
+      /* This is a conservative approximation, since some of size_diff might
+	 fit into the existing padding for alignment.  */
+      size_t new_reclen = ALIGN_UP (kdp->d_reclen + size_diff,
+				    _Alignof (struct dirent64));
+      if (nb + new_reclen > nbytes)
+	{
+	  /* Entry is too large for the fixed-size buffer.  */
+	  if (last_offset == -1)
 	    {
-	      scratch_buffer_free (&tmpbuf);
-	      return INLINE_SYSCALL_ERROR_RETURN_VALUE (EINVAL);
+	      __set_errno (EINVAL);
+	      return -1;
 	    }
 
-	  break;
+	  /* The new entry will overflow the input buffer, rewind to last
+	     obtained entry and return.  */
+	  __lseek64 (fd, last_offset, SEEK_SET);
+	  return (char *) dp - (char *) buf;
 	}
-
-      last_offset = kdp->d_off;
-      dp->d_ino = kdp->d_ino;
-      dp->d_off = kdp->d_off;
-      dp->d_reclen = new_reclen;
+      nb += new_reclen;
+
+#define DP_MEMBER(src, type, member)			     \
+    (__typeof__((type){0}.member) *)			     \
+      memcpy (&((__typeof__((type){0}.member)){0}),          \
+	      ((char *)(src) + offsetof (type, member)),     \
+	      sizeof ((type){0}.member))
+
+      memcpy (((char *)(dp) + offsetof (struct dirent64, d_ino)),
+	      DP_MEMBER (kdp, struct kernel_dirent, d_ino),
+	      sizeof ((struct dirent64){0}.d_ino));
+      memcpy (((char *)(dp) + offsetof (struct dirent64, d_off)),
+	      DP_MEMBER (kdp, struct kernel_dirent, d_ino),
+	      sizeof ((struct dirent64){0}.d_ino));
+      memcpy (&last_offset,
+	      DP_MEMBER (kdp, struct kernel_dirent, d_off),
+	      sizeof (last_offset));
+      memcpy (DP_MEMBER (dp, struct dirent64, d_reclen), &new_reclen,
+	      sizeof ((struct dirent64){0}.d_reclen));
       dp->d_type = *((char *) kdp + kdp->d_reclen - 1);
       memcpy (dp->d_name, kdp->d_name,
 	      kdp->d_reclen - offsetof (struct kernel_dirent, d_name));
@@ -112,8 +137,7 @@ __getdents64 (int fd, void *buf0, size_t nbytes)
       kdp = (struct kernel_dirent *) (((char *) kdp) + kdp->d_reclen);
     }
 
-  scratch_buffer_free (&tmpbuf);
-  return (char *) dp - buf;
+  return (char *) dp - (char *) buf;
 }
 libc_hidden_def (__getdents64)
 weak_alias (__getdents64, getdents64)
Florian Weimer Oct. 8, 2019, 6:52 p.m. UTC | #15
* Adhemerval Zanella:

> +#define DP_MEMBER(src, type, member)			     \

> +    (__typeof__((type){0}.member) *)			     \

> +      memcpy (&((__typeof__((type){0}.member)){0}),          \

> +	      ((char *)(src) + offsetof (type, member)),     \

> +	      sizeof ((type){0}.member))


Please add a comment that this is used to avoid an aliasing violation.

> +      memcpy (((char *)(dp) + offsetof (struct dirent64, d_ino)),

> +	      DP_MEMBER (kdp, struct kernel_dirent, d_ino),

> +	      sizeof ((struct dirent64){0}.d_ino));

> +      memcpy (((char *)(dp) + offsetof (struct dirent64, d_off)),

> +	      DP_MEMBER (kdp, struct kernel_dirent, d_ino),

> +	      sizeof ((struct dirent64){0}.d_ino));

> +      memcpy (&last_offset,

> +	      DP_MEMBER (kdp, struct kernel_dirent, d_off),

> +	      sizeof (last_offset));


I think you should be able to use:

   last_offset = *DP_MEMBER (kdp, struct kernel_dirent, d_off);

last_offset has the correct type.

> +      memcpy (DP_MEMBER (dp, struct dirent64, d_reclen), &new_reclen,

> +	      sizeof ((struct dirent64){0}.d_reclen));


That looks wrong.  DP_MEMBER (dp, struct dirent64, d_reclen) is a
temporary object, so the outer memcpy is dead.

Thanks,
Florian
Adhemerval Zanella Netto Oct. 8, 2019, 7:52 p.m. UTC | #16
On 08/10/2019 15:52, Florian Weimer wrote:
> * Adhemerval Zanella:

> 

>> +#define DP_MEMBER(src, type, member)			     \

>> +    (__typeof__((type){0}.member) *)			     \

>> +      memcpy (&((__typeof__((type){0}.member)){0}),          \

>> +	      ((char *)(src) + offsetof (type, member)),     \

>> +	      sizeof ((type){0}.member))

> 

> Please add a comment that this is used to avoid an aliasing violation.


Ack.

> 

>> +      memcpy (((char *)(dp) + offsetof (struct dirent64, d_ino)),

>> +	      DP_MEMBER (kdp, struct kernel_dirent, d_ino),

>> +	      sizeof ((struct dirent64){0}.d_ino));

>> +      memcpy (((char *)(dp) + offsetof (struct dirent64, d_off)),

>> +	      DP_MEMBER (kdp, struct kernel_dirent, d_ino),

>> +	      sizeof ((struct dirent64){0}.d_ino));

>> +      memcpy (&last_offset,

>> +	      DP_MEMBER (kdp, struct kernel_dirent, d_off),

>> +	      sizeof (last_offset));

> 

> I think you should be able to use:

> 

>    last_offset = *DP_MEMBER (kdp, struct kernel_dirent, d_off);

> 

> last_offset has the correct type.


Ack.

> 

>> +      memcpy (DP_MEMBER (dp, struct dirent64, d_reclen), &new_reclen,

>> +	      sizeof ((struct dirent64){0}.d_reclen));

> 

> That looks wrong.  DP_MEMBER (dp, struct dirent64, d_reclen) is a

> temporary object, so the outer memcpy is dead.


Sigh, indeed. I changed to:

   memcpy (((char *)(dp) + offsetof (struct dirent64, d_reclen)),
           &new_reclen, sizeof ((struct dirent64){0}.d_reclen));

> 

> Thanks,

> Florian

> 


I hope I got this right this time...
Florian Weimer Oct. 8, 2019, 7:59 p.m. UTC | #17
* Adhemerval Zanella:

>>> +      memcpy (DP_MEMBER (dp, struct dirent64, d_reclen), &new_reclen,

>>> +	      sizeof ((struct dirent64){0}.d_reclen));

>> 

>> That looks wrong.  DP_MEMBER (dp, struct dirent64, d_reclen) is a

>> temporary object, so the outer memcpy is dead.

>

> Sigh, indeed. I changed to:

>

>    memcpy (((char *)(dp) + offsetof (struct dirent64, d_reclen)),

>            &new_reclen, sizeof ((struct dirent64){0}.d_reclen));


sizeof ((struct dirent64){0}.d_reclen) could just be
sizeof (new_reclen).  After all, this only works if they are the same.

I guess -fno-strict-aliasing looks more attractive now. 8-/

You probably should write ((char *) dp) instead of (char *)(dp) if you
want to make the operator precedence explicit, or at least drop the
parentheses around dp.  (I think the cast binds tighter than the +,
but I can't really remember.  I tend to write the paranetheses.)
Adhemerval Zanella Netto Oct. 9, 2019, 1:02 p.m. UTC | #18
On 08/10/2019 16:59, Florian Weimer wrote:
> * Adhemerval Zanella:

> 

>>>> +      memcpy (DP_MEMBER (dp, struct dirent64, d_reclen), &new_reclen,

>>>> +	      sizeof ((struct dirent64){0}.d_reclen));

>>>

>>> That looks wrong.  DP_MEMBER (dp, struct dirent64, d_reclen) is a

>>> temporary object, so the outer memcpy is dead.

>>

>> Sigh, indeed. I changed to:

>>

>>    memcpy (((char *)(dp) + offsetof (struct dirent64, d_reclen)),

>>            &new_reclen, sizeof ((struct dirent64){0}.d_reclen));

> 

> sizeof ((struct dirent64){0}.d_reclen) could just be

> sizeof (new_reclen).  After all, this only works if they are the same.


Ack.

> 

> I guess -fno-strict-aliasing looks more attractive now. 8-/

> 

> You probably should write ((char *) dp) instead of (char *)(dp) if you

> want to make the operator precedence explicit, or at least drop the

> parentheses around dp.  (I think the cast binds tighter than the +,

> but I can't really remember.  I tend to write the paranetheses.)

> 


Ack.
Joseph Myers Nov. 2, 2020, 7:51 p.m. UTC | #19
On Tue, 8 Oct 2019, Adhemerval Zanella wrote:

> +      memcpy (((char *)(dp) + offsetof (struct dirent64, d_ino)),

> +	      DP_MEMBER (kdp, struct kernel_dirent, d_ino),

> +	      sizeof ((struct dirent64){0}.d_ino));

> +      memcpy (((char *)(dp) + offsetof (struct dirent64, d_off)),

> +	      DP_MEMBER (kdp, struct kernel_dirent, d_ino),

> +	      sizeof ((struct dirent64){0}.d_ino));


(This is slightly different from the version of the code that ended up 
getting committed.)

GCC mainline now gives a rather cryptic error about this code:

../sysdeps/unix/sysv/linux/mips/mips64/getdents64.c: In function '__getdents64':
../sysdeps/unix/sysv/linux/mips/mips64/getdents64.c:121:7: error: 'memcpy' forming offset [4, 7] is out of the bounds [0, 4] [-Werror=array-bounds]
  121 |       memcpy (((char *) dp + offsetof (struct dirent64, d_ino)),
      |       ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  122 |               KDP_MEMBER (kdp, d_ino), sizeof ((struct dirent64){0}.d_ino));
      |               ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
../sysdeps/unix/sysv/linux/mips/mips64/getdents64.c:123:7: error: 'memcpy' forming offset [4, 7] is out of the bounds [0, 4] [-Werror=array-bounds]
  123 |       memcpy (((char *) dp + offsetof (struct dirent64, d_off)),
      |       ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  124 |               KDP_MEMBER (kdp, d_off), sizeof ((struct dirent64){0}.d_off));
      |               ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

I think what this error is pointing out is that the field in 
kernel_dirent, for non-n64, is 32-bit, while this is using memcpy to copy 
64 bits from it into the glibc dirent64, which obviously doesn't work.

-- 
Joseph S. Myers
joseph@codesourcery.com
Adhemerval Zanella via Libc-alpha Nov. 2, 2020, 10:10 p.m. UTC | #20
On 02/11/2020 16:51, Joseph Myers wrote:
> On Tue, 8 Oct 2019, Adhemerval Zanella wrote:

> 

>> +      memcpy (((char *)(dp) + offsetof (struct dirent64, d_ino)),

>> +	      DP_MEMBER (kdp, struct kernel_dirent, d_ino),

>> +	      sizeof ((struct dirent64){0}.d_ino));

>> +      memcpy (((char *)(dp) + offsetof (struct dirent64, d_off)),

>> +	      DP_MEMBER (kdp, struct kernel_dirent, d_ino),

>> +	      sizeof ((struct dirent64){0}.d_ino));

> 

> (This is slightly different from the version of the code that ended up 

> getting committed.)

> 

> GCC mainline now gives a rather cryptic error about this code:

> 

> ../sysdeps/unix/sysv/linux/mips/mips64/getdents64.c: In function '__getdents64':

> ../sysdeps/unix/sysv/linux/mips/mips64/getdents64.c:121:7: error: 'memcpy' forming offset [4, 7] is out of the bounds [0, 4] [-Werror=array-bounds]

>   121 |       memcpy (((char *) dp + offsetof (struct dirent64, d_ino)),

>       |       ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

>   122 |               KDP_MEMBER (kdp, d_ino), sizeof ((struct dirent64){0}.d_ino));

>       |               ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

> ../sysdeps/unix/sysv/linux/mips/mips64/getdents64.c:123:7: error: 'memcpy' forming offset [4, 7] is out of the bounds [0, 4] [-Werror=array-bounds]

>   123 |       memcpy (((char *) dp + offsetof (struct dirent64, d_off)),

>       |       ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

>   124 |               KDP_MEMBER (kdp, d_off), sizeof ((struct dirent64){0}.d_off));

>       |               ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

> 

> I think what this error is pointing out is that the field in 

> kernel_dirent, for non-n64, is 32-bit, while this is using memcpy to copy 

> 64 bits from it into the glibc dirent64, which obviously doesn't work.

> 


I was trying to be too clever to avoid a temporary variable to handle 
mips64n32.  I think the below should handle the issue raised by GCC11,
I will just check some on mips64 machine from gcc farm before send the
fix.

---

diff --git a/sysdeps/unix/sysv/linux/mips/mips64/getdents64.c b/sysdeps/unix/sysv/linux/mips/mips64/getdents64.c
index d18a5297dc..2ea1369ef4 100644
--- a/sysdeps/unix/sysv/linux/mips/mips64/getdents64.c
+++ b/sysdeps/unix/sysv/linux/mips/mips64/getdents64.c
@@ -90,16 +90,29 @@ __getdents64 (int fd, void *buf, size_t nbytes)
 
   while ((char *) kdp < (char *) skdp + r)
     {
-      /* This macro is used to avoid aliasing violation.  */
-#define KDP_MEMBER(src, member)			     			\
-    (__typeof__((struct kernel_dirent){0}.member) *)			\
-      memcpy (&((__typeof__((struct kernel_dirent){0}.member)){0}),	\
-	      ((char *)(src) + offsetof (struct kernel_dirent, member)),\
-	      sizeof ((struct kernel_dirent){0}.member))
+#define KDP_MEMBER(src, member)						     \
+      ({								     \
+	__typeof ((struct kernel_dirent){0}.member) kdp_tmp;		     \
+	memcpy (&kdp_tmp,						     \
+		((char *)(src) + offsetof (struct kernel_dirent, member)),   \
+		sizeof (kdp_tmp));					     \
+	kdp_tmp;							     \
+      })
+
+      /* Copy the MEMBER from SRC kernel_dirent to DST dirent64.  It handles
+	 the different size of d_off/d_ino for mips64-n32 by using temporary
+	 variables.  */
+#define COPY_MEMBER(src, dst, member)					     \
+      ({								     \
+	__typeof ((struct dirent64){0}.member) dp_tmp			     \
+	  = KDP_MEMBER (src, member);					     \
+	memcpy ((char *) dp + offsetof (struct dirent64, d_off),	     \
+		&dp_tmp, sizeof (dp_tmp));				     \
+      })
 
       /* This is a conservative approximation, since some of size_diff might
 	 fit into the existing padding for alignment.  */
-      unsigned short int k_reclen = *KDP_MEMBER (kdp, d_reclen);
+      unsigned short int k_reclen = KDP_MEMBER (kdp, d_reclen);
       unsigned short int new_reclen = ALIGN_UP (k_reclen + size_diff,
 						_Alignof (struct dirent64));
       if (nb + new_reclen > nbytes)
@@ -118,11 +131,10 @@ __getdents64 (int fd, void *buf, size_t nbytes)
 	}
       nb += new_reclen;
 
-      memcpy (((char *) dp + offsetof (struct dirent64, d_ino)),
-	      KDP_MEMBER (kdp, d_ino), sizeof ((struct dirent64){0}.d_ino));
-      memcpy (((char *) dp + offsetof (struct dirent64, d_off)),
-	      KDP_MEMBER (kdp, d_off), sizeof ((struct dirent64){0}.d_off));
-      last_offset = *KDP_MEMBER (kdp, d_off);
+      COPY_MEMBER (dp, kdp, d_off);
+      COPY_MEMBER (dp, kdp, d_ino);
+
+      last_offset = KDP_MEMBER (kdp, d_off);
       memcpy (((char *) dp + offsetof (struct dirent64, d_reclen)),
 	      &new_reclen, sizeof (new_reclen));
       dp->d_type = *((char *) kdp + k_reclen - 1);
Adhemerval Zanella via Libc-alpha Nov. 3, 2020, 10:27 a.m. UTC | #21
* Adhemerval Zanella:

> I was trying to be too clever to avoid a temporary variable to handle 

> mips64n32.  I think the below should handle the issue raised by GCC11,

> I will just check some on mips64 machine from gcc farm before send the

> fix.


I think at this point it might be clearer two have two new structs with
the first few fields of the actual structs, use memcpy on the whole
structs, and a field-by-field copy between the structs to perform the
type adjustment.

It's not that we expect the layout of the structs involved to change.

thanks,
Florian
-- 
Red Hat GmbH, https://de.redhat.com/ , Registered seat: Grasbrunn,
Commercial register: Amtsgericht Muenchen, HRB 153243,
Managing Directors: Charles Cachera, Brian Klemm, Laurie Krebs, Michael O'Neill
diff mbox series

Patch

diff --git a/sysdeps/unix/sysv/linux/mips/mips64/getdents64.c b/sysdeps/unix/sysv/linux/mips/mips64/getdents64.c
index 8bf3abb0e0..3b5afd9324 100644
--- a/sysdeps/unix/sysv/linux/mips/mips64/getdents64.c
+++ b/sysdeps/unix/sysv/linux/mips/mips64/getdents64.c
@@ -22,98 +22,84 @@ 
 #include <assert.h>
 #include <sys/param.h>
 #include <unistd.h>
-#include <scratch_buffer.h>
 #include <limits.h>
 
 ssize_t
-__getdents64 (int fd, void *buf0, size_t nbytes)
+__getdents64 (int fd, void *buf, size_t nbytes)
 {
-  char *buf = buf0;
-
   /* The system call takes an unsigned int argument, and some length
      checks in the kernel use an int type.  */
   if (nbytes > INT_MAX)
     nbytes = INT_MAX;
 
 #ifdef __NR_getdents64
-  ssize_t ret = INLINE_SYSCALL_CALL (getdents64, fd, buf, nbytes);
-  if (ret != -1)
-    return ret;
+  static bool getdents64_supportted = true;
+  if (atomic_load_relaxed (&getdents64_supportted))
+    {
+      ssize_t ret = INLINE_SYSCALL_CALL (getdents64, fd, buf, nbytes);
+      if (ret >= 0 || errno != ENOSYS)
+	return ret;
+
+      atomic_store_relaxed (&getdents64_supportted, false);
+    }
 #endif
 
   /* Unfortunately getdents64 was only wire-up for MIPS n64 on Linux 3.10.
-     If syscall is not available it need to fallback to non-LFS one.  */
+     If the syscall is not available it need to fallback to the non-LFS one.
+     Also to avoid an unbounded allocation through VLA/alloca or malloc (which
+     would make the syscall non async-signal-safe) it uses a limited buffer.
+     This is sub-optimal for large NBYTES, however this is a fallback
+     mechanism to emulate a syscall that kernel should provide.   */
 
+  enum { KBUF_SIZE = 1024 };
   struct kernel_dirent
-    {
-      unsigned long d_ino;
-      unsigned long d_off;
-      unsigned short int d_reclen;
-      char d_name[256];
-    };
-
-  const size_t size_diff = (offsetof (struct dirent64, d_name)
-			   - offsetof (struct kernel_dirent, d_name));
-
-  size_t red_nbytes = MIN (nbytes
-			   - ((nbytes / (offsetof (struct dirent64, d_name)
-					 + 14)) * size_diff),
-			   nbytes - size_diff);
-
-  struct scratch_buffer tmpbuf;
-  scratch_buffer_init (&tmpbuf);
-  if (!scratch_buffer_set_array_size (&tmpbuf, red_nbytes, sizeof (uint8_t)))
-    INLINE_SYSCALL_ERROR_RETURN_VALUE (ENOMEM);
-
-  struct kernel_dirent *skdp, *kdp;
-  skdp = kdp = tmpbuf.data;
-
-  ssize_t retval = INLINE_SYSCALL_CALL (getdents, fd, kdp, red_nbytes);
-  if (retval == -1)
-    {
-      scratch_buffer_free (&tmpbuf);
-      return -1;
-    }
+  {
+    unsigned long d_ino;
+    unsigned long d_off;
+    unsigned short int d_reclen;
+    char d_name[1];
+  } kbuf[KBUF_SIZE / sizeof (struct kernel_dirent)];
+  size_t kbuf_size = nbytes < KBUF_SIZE ? nbytes : KBUF_SIZE;
 
-  off64_t last_offset = -1;
   struct dirent64 *dp = (struct dirent64 *) buf;
-  while ((char *) kdp < (char *) skdp + retval)
+
+  size_t nb = 0;
+  off64_t last_offset = -1;
+
+  ssize_t r;
+  while ((r = INLINE_SYSCALL_CALL (getdents, fd, kbuf, kbuf_size)) > 0)
     {
-      const size_t alignment = _Alignof (struct dirent64);
-      /* Since kdp->d_reclen is already aligned for the kernel structure
-	 this may compute a value that is bigger than necessary.  */
-      size_t new_reclen = ((kdp->d_reclen + size_diff + alignment - 1)
-			   & ~(alignment - 1));
-      if ((char *) dp + new_reclen > buf + nbytes)
-        {
-	  /* Our heuristic failed.  We read too many entries.  Reset
-	     the stream.  */
-	  assert (last_offset != -1);
-	  __lseek64 (fd, last_offset, SEEK_SET);
-
-	  if ((char *) dp == buf)
+      struct kernel_dirent *skdp, *kdp;
+      skdp = kdp = kbuf;
+
+      while ((char *) kdp < (char *) skdp + r)
+	{
+	  const size_t alignment = _Alignof (struct dirent64);
+	  size_t new_reclen = ((kdp->d_reclen + alignment - 1)
+			      & ~(alignment - 1));
+	  if (nb + new_reclen > nbytes)
 	    {
-	      scratch_buffer_free (&tmpbuf);
-	      return INLINE_SYSCALL_ERROR_RETURN_VALUE (EINVAL);
+		/* The new entry will overflow the input buffer, rewind to
+		   last obtained entry and return.  */
+	       __lseek64 (fd, last_offset, SEEK_SET);
+	       goto out;
 	    }
+	  nb += new_reclen;
 
-	  break;
-	}
-
-      last_offset = kdp->d_off;
-      dp->d_ino = kdp->d_ino;
-      dp->d_off = kdp->d_off;
-      dp->d_reclen = new_reclen;
-      dp->d_type = *((char *) kdp + kdp->d_reclen - 1);
-      memcpy (dp->d_name, kdp->d_name,
-	      kdp->d_reclen - offsetof (struct kernel_dirent, d_name));
+	  dp->d_ino = kdp->d_ino;
+	  dp->d_off = last_offset = kdp->d_off;
+	  dp->d_reclen = new_reclen;
+	  dp->d_type = *((char *) kdp + kdp->d_reclen - 1);
+	  memcpy (dp->d_name, kdp->d_name,
+		  kdp->d_reclen - offsetof (struct kernel_dirent, d_name));
 
-      dp = (struct dirent64 *) ((char *) dp + new_reclen);
-      kdp = (struct kernel_dirent *) (((char *) kdp) + kdp->d_reclen);
+	  dp = (struct dirent64 *) ((char *) dp + new_reclen);
+	  kdp = (struct kernel_dirent *) (((char *) kdp) + kdp->d_reclen);
+	}
     }
 
-  scratch_buffer_free (&tmpbuf);
-  return (char *) dp - buf;
+out:
+  return (char *) dp - (char *) buf;
 }
 libc_hidden_def (__getdents64)
 weak_alias (__getdents64, getdents64)