diff mbox series

fs: apply umask if POSIX ACL support is disabled

Message ID 20230919081900.1096840-1-max.kellermann@ionos.com
State New
Headers show
Series fs: apply umask if POSIX ACL support is disabled | expand

Commit Message

Max Kellermann Sept. 19, 2023, 8:18 a.m. UTC
Signed-off-by: Max Kellermann <max.kellermann@ionos.com>
---
 fs/ceph/super.h           | 1 +
 fs/ext2/acl.h             | 1 +
 fs/jfs/jfs_acl.h          | 1 +
 include/linux/posix_acl.h | 1 +
 4 files changed, 4 insertions(+)

Comments

Xiubo Li Sept. 21, 2023, 12:51 a.m. UTC | #1
On 9/19/23 16:18, Max Kellermann wrote:
> Signed-off-by: Max Kellermann <max.kellermann@ionos.com>
> ---
>   fs/ceph/super.h           | 1 +
>   fs/ext2/acl.h             | 1 +
>   fs/jfs/jfs_acl.h          | 1 +
>   include/linux/posix_acl.h | 1 +
>   4 files changed, 4 insertions(+)
>
> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> index 51c7f2b14f6f..e7e2f264acf4 100644
> --- a/fs/ceph/super.h
> +++ b/fs/ceph/super.h
> @@ -1194,6 +1194,7 @@ static inline void ceph_forget_all_cached_acls(struct inode *inode)
>   static inline int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
>   				     struct ceph_acl_sec_ctx *as_ctx)
>   {
> +	*mode &= ~current_umask();
>   	return 0;
>   }

This LGTM.

Shouldn't we also do this in 'ceph_pre_init_acls()' when we couldn't get 
'acl' from 'posix_acl_create()' ?

Thanks!

- Xiubo


>   static inline void ceph_init_inode_acls(struct inode *inode,
> diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h
> index 4a8443a2b8ec..694af789c614 100644
> --- a/fs/ext2/acl.h
> +++ b/fs/ext2/acl.h
> @@ -67,6 +67,7 @@ extern int ext2_init_acl (struct inode *, struct inode *);
>   
>   static inline int ext2_init_acl (struct inode *inode, struct inode *dir)
>   {
> +	inode->i_mode &= ~current_umask();
>   	return 0;
>   }
>   #endif
> diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h
> index f892e54d0fcd..10791e97a46f 100644
> --- a/fs/jfs/jfs_acl.h
> +++ b/fs/jfs/jfs_acl.h
> @@ -17,6 +17,7 @@ int jfs_init_acl(tid_t, struct inode *, struct inode *);
>   static inline int jfs_init_acl(tid_t tid, struct inode *inode,
>   			       struct inode *dir)
>   {
> +	inode->i_mode &= ~current_umask();
>   	return 0;
>   }
>   
> diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h
> index 0e65b3d634d9..54bc9b1061ca 100644
> --- a/include/linux/posix_acl.h
> +++ b/include/linux/posix_acl.h
> @@ -128,6 +128,7 @@ static inline void cache_no_acl(struct inode *inode)
>   static inline int posix_acl_create(struct inode *inode, umode_t *mode,
>   		struct posix_acl **default_acl, struct posix_acl **acl)
>   {
> +	*mode &= ~current_umask();
>   	*default_acl = *acl = NULL;
>   	return 0;
>   }
Dave Kleikamp Oct. 3, 2023, 3:32 p.m. UTC | #2
I think this is sane, but the patch needs a description of why this is 
necessary for these specific file systems.

Thanks,
Shaggy

On 9/19/23 3:18AM, Max Kellermann wrote:
> Signed-off-by: Max Kellermann <max.kellermann@ionos.com>
> ---
>   fs/ceph/super.h           | 1 +
>   fs/ext2/acl.h             | 1 +
>   fs/jfs/jfs_acl.h          | 1 +
>   include/linux/posix_acl.h | 1 +
>   4 files changed, 4 insertions(+)
> 
> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> index 51c7f2b14f6f..e7e2f264acf4 100644
> --- a/fs/ceph/super.h
> +++ b/fs/ceph/super.h
> @@ -1194,6 +1194,7 @@ static inline void ceph_forget_all_cached_acls(struct inode *inode)
>   static inline int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
>   				     struct ceph_acl_sec_ctx *as_ctx)
>   {
> +	*mode &= ~current_umask();
>   	return 0;
>   }
>   static inline void ceph_init_inode_acls(struct inode *inode,
> diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h
> index 4a8443a2b8ec..694af789c614 100644
> --- a/fs/ext2/acl.h
> +++ b/fs/ext2/acl.h
> @@ -67,6 +67,7 @@ extern int ext2_init_acl (struct inode *, struct inode *);
>   
>   static inline int ext2_init_acl (struct inode *inode, struct inode *dir)
>   {
> +	inode->i_mode &= ~current_umask();
>   	return 0;
>   }
>   #endif
> diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h
> index f892e54d0fcd..10791e97a46f 100644
> --- a/fs/jfs/jfs_acl.h
> +++ b/fs/jfs/jfs_acl.h
> @@ -17,6 +17,7 @@ int jfs_init_acl(tid_t, struct inode *, struct inode *);
>   static inline int jfs_init_acl(tid_t tid, struct inode *inode,
>   			       struct inode *dir)
>   {
> +	inode->i_mode &= ~current_umask();
>   	return 0;
>   }
>   
> diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h
> index 0e65b3d634d9..54bc9b1061ca 100644
> --- a/include/linux/posix_acl.h
> +++ b/include/linux/posix_acl.h
> @@ -128,6 +128,7 @@ static inline void cache_no_acl(struct inode *inode)
>   static inline int posix_acl_create(struct inode *inode, umode_t *mode,
>   		struct posix_acl **default_acl, struct posix_acl **acl)
>   {
> +	*mode &= ~current_umask();
>   	*default_acl = *acl = NULL;
>   	return 0;
>   }
Xiubo Li Oct. 7, 2023, 1:19 a.m. UTC | #3
On 10/3/23 23:32, Dave Kleikamp wrote:
> I think this is sane, but the patch needs a description of why this is 
> necessary for these specific file systems.
>
Sounds reasonable.

Thanks

- Xiubo


> Thanks,
> Shaggy
>
> On 9/19/23 3:18AM, Max Kellermann wrote:
>> Signed-off-by: Max Kellermann <max.kellermann@ionos.com>
>> ---
>>   fs/ceph/super.h           | 1 +
>>   fs/ext2/acl.h             | 1 +
>>   fs/jfs/jfs_acl.h          | 1 +
>>   include/linux/posix_acl.h | 1 +
>>   4 files changed, 4 insertions(+)
>>
>> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
>> index 51c7f2b14f6f..e7e2f264acf4 100644
>> --- a/fs/ceph/super.h
>> +++ b/fs/ceph/super.h
>> @@ -1194,6 +1194,7 @@ static inline void 
>> ceph_forget_all_cached_acls(struct inode *inode)
>>   static inline int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
>>                        struct ceph_acl_sec_ctx *as_ctx)
>>   {
>> +    *mode &= ~current_umask();
>>       return 0;
>>   }
>>   static inline void ceph_init_inode_acls(struct inode *inode,
>> diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h
>> index 4a8443a2b8ec..694af789c614 100644
>> --- a/fs/ext2/acl.h
>> +++ b/fs/ext2/acl.h
>> @@ -67,6 +67,7 @@ extern int ext2_init_acl (struct inode *, struct 
>> inode *);
>>     static inline int ext2_init_acl (struct inode *inode, struct 
>> inode *dir)
>>   {
>> +    inode->i_mode &= ~current_umask();
>>       return 0;
>>   }
>>   #endif
>> diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h
>> index f892e54d0fcd..10791e97a46f 100644
>> --- a/fs/jfs/jfs_acl.h
>> +++ b/fs/jfs/jfs_acl.h
>> @@ -17,6 +17,7 @@ int jfs_init_acl(tid_t, struct inode *, struct 
>> inode *);
>>   static inline int jfs_init_acl(tid_t tid, struct inode *inode,
>>                      struct inode *dir)
>>   {
>> +    inode->i_mode &= ~current_umask();
>>       return 0;
>>   }
>>   diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h
>> index 0e65b3d634d9..54bc9b1061ca 100644
>> --- a/include/linux/posix_acl.h
>> +++ b/include/linux/posix_acl.h
>> @@ -128,6 +128,7 @@ static inline void cache_no_acl(struct inode *inode)
>>   static inline int posix_acl_create(struct inode *inode, umode_t *mode,
>>           struct posix_acl **default_acl, struct posix_acl **acl)
>>   {
>> +    *mode &= ~current_umask();
>>       *default_acl = *acl = NULL;
>>       return 0;
>>   }
>
Max Kellermann Oct. 9, 2023, 2:45 p.m. UTC | #4
On Tue, Oct 3, 2023 at 5:32 PM Dave Kleikamp <dave.kleikamp@oracle.com> wrote:
> I think this is sane, but the patch needs a description of why this is
> necessary for these specific file systems.

Indeed the patch description was lacking, sorry. I sent v2 with a
better description.

Max
Dave Kleikamp Oct. 9, 2023, 4:49 p.m. UTC | #5
On 10/9/23 9:43AM, Max Kellermann wrote:
> One important implementation detail of the posix_acl_create() function
> is that it applies the umask to the "mode" parameter.  If
> CONFIG_FS_POSIX_ACL is disabled, this detail is missing and the umask
> may not get applied.
> 
> This patch adds the missing code to posix_acl_create() and to three
> filesystems that omit the posix_acl_create() call if their individual
> ACL support is disabled (CONFIG_EXT2_FS_POSIX_ACL,
> CONFIG_JFS_POSIX_ACL, CONFIG_CEPH_FS_POSIX_ACL).  If
> posix_acl_create() never gets called, the umask needs to be applied
> anyway.
> 
> This bug used to be exploitable easily with O_TMPFILE (see
> https://bugzilla.kernel.org/show_bug.cgi?id=203625) but that part was
> fixed by commit ac6800e279a2 ("fs: Add missing umask strip in
> vfs_tmpfile") last year.  The bug may not be reachable by userspace
> anymore, but since it is apparently still necessary to apply the umask
> again in posix_acl_create(), there is no reason to assume it's not
> necessary when ACL support is disabled.
> 
> Signed-off-by: Max Kellermann <max.kellermann@ionos.com>

Acked-by: Dave Kleikamp <dave.kleikamp@oracle.com>

> ---
>   fs/ceph/super.h           | 6 ++++++
>   fs/ext2/acl.h             | 6 ++++++
>   fs/jfs/jfs_acl.h          | 6 ++++++
>   include/linux/posix_acl.h | 1 +
>   4 files changed, 19 insertions(+)
> 
> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> index 51c7f2b14f6f..58349639bd57 100644
> --- a/fs/ceph/super.h
> +++ b/fs/ceph/super.h
> @@ -1194,6 +1194,12 @@ static inline void ceph_forget_all_cached_acls(struct inode *inode)
>   static inline int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
>   				     struct ceph_acl_sec_ctx *as_ctx)
>   {
> +	/* usually, the umask is applied by posix_acl_create(), but if
> +	 * ACL support is disabled at compile time, we need to do it
> +	 * here, because posix_acl_create() will never be called
> +	 */
> +	*mode &= ~current_umask();
> +
>   	return 0;
>   }
>   static inline void ceph_init_inode_acls(struct inode *inode,
> diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h
> index 4a8443a2b8ec..0ecaa9c20c0c 100644
> --- a/fs/ext2/acl.h
> +++ b/fs/ext2/acl.h
> @@ -67,6 +67,12 @@ extern int ext2_init_acl (struct inode *, struct inode *);
>   
>   static inline int ext2_init_acl (struct inode *inode, struct inode *dir)
>   {
> +	/* usually, the umask is applied by posix_acl_create(), but if
> +	 * ACL support is disabled at compile time, we need to do it
> +	 * here, because posix_acl_create() will never be called
> +	 */
> +	inode->i_mode &= ~current_umask();
> +
>   	return 0;
>   }
>   #endif
> diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h
> index f892e54d0fcd..64a05e663a45 100644
> --- a/fs/jfs/jfs_acl.h
> +++ b/fs/jfs/jfs_acl.h
> @@ -17,6 +17,12 @@ int jfs_init_acl(tid_t, struct inode *, struct inode *);
>   static inline int jfs_init_acl(tid_t tid, struct inode *inode,
>   			       struct inode *dir)
>   {
> +	/* usually, the umask is applied by posix_acl_create(), but if
> +	 * ACL support is disabled at compile time, we need to do it
> +	 * here, because posix_acl_create() will never be called
> +	 */
> +	inode->i_mode &= ~current_umask();
> +
>   	return 0;
>   }
>   
> diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h
> index 0e65b3d634d9..54bc9b1061ca 100644
> --- a/include/linux/posix_acl.h
> +++ b/include/linux/posix_acl.h
> @@ -128,6 +128,7 @@ static inline void cache_no_acl(struct inode *inode)
>   static inline int posix_acl_create(struct inode *inode, umode_t *mode,
>   		struct posix_acl **default_acl, struct posix_acl **acl)
>   {
> +	*mode &= ~current_umask();
>   	*default_acl = *acl = NULL;
>   	return 0;
>   }
Jan Kara Oct. 10, 2023, 1:11 p.m. UTC | #6
On Mon 09-10-23 16:43:39, Max Kellermann wrote:
> One important implementation detail of the posix_acl_create() function
> is that it applies the umask to the "mode" parameter.  If
> CONFIG_FS_POSIX_ACL is disabled, this detail is missing and the umask
> may not get applied.
> 
> This patch adds the missing code to posix_acl_create() and to three
> filesystems that omit the posix_acl_create() call if their individual
> ACL support is disabled (CONFIG_EXT2_FS_POSIX_ACL,
> CONFIG_JFS_POSIX_ACL, CONFIG_CEPH_FS_POSIX_ACL).  If
> posix_acl_create() never gets called, the umask needs to be applied
> anyway.
> 
> This bug used to be exploitable easily with O_TMPFILE (see
> https://bugzilla.kernel.org/show_bug.cgi?id=203625) but that part was
> fixed by commit ac6800e279a2 ("fs: Add missing umask strip in
> vfs_tmpfile") last year.  The bug may not be reachable by userspace
> anymore, but since it is apparently still necessary to apply the umask
> again in posix_acl_create(), there is no reason to assume it's not
> necessary when ACL support is disabled.
> 
> Signed-off-by: Max Kellermann <max.kellermann@ionos.com>

Thanks for the updated changelog! But as I'm looking into VFS code isn't
this already handled by mode_strip_umask() / vfs_prepare_mode() in
fs/namei.c? Because posix_acl_create() doesn't do anything to 'mode' for
!IS_POSIXACL() filesystems either. So at least ext2 (where I've checked
the mount option handling) does seem to have umask properly applied in all
the cases. But I might be missing something...

								Honza

> ---
>  fs/ceph/super.h           | 6 ++++++
>  fs/ext2/acl.h             | 6 ++++++
>  fs/jfs/jfs_acl.h          | 6 ++++++
>  include/linux/posix_acl.h | 1 +
>  4 files changed, 19 insertions(+)
> 
> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> index 51c7f2b14f6f..58349639bd57 100644
> --- a/fs/ceph/super.h
> +++ b/fs/ceph/super.h
> @@ -1194,6 +1194,12 @@ static inline void ceph_forget_all_cached_acls(struct inode *inode)
>  static inline int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
>  				     struct ceph_acl_sec_ctx *as_ctx)
>  {
> +	/* usually, the umask is applied by posix_acl_create(), but if
> +	 * ACL support is disabled at compile time, we need to do it
> +	 * here, because posix_acl_create() will never be called
> +	 */
> +	*mode &= ~current_umask();
> +
>  	return 0;
>  }
>  static inline void ceph_init_inode_acls(struct inode *inode,
> diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h
> index 4a8443a2b8ec..0ecaa9c20c0c 100644
> --- a/fs/ext2/acl.h
> +++ b/fs/ext2/acl.h
> @@ -67,6 +67,12 @@ extern int ext2_init_acl (struct inode *, struct inode *);
>  
>  static inline int ext2_init_acl (struct inode *inode, struct inode *dir)
>  {
> +	/* usually, the umask is applied by posix_acl_create(), but if
> +	 * ACL support is disabled at compile time, we need to do it
> +	 * here, because posix_acl_create() will never be called
> +	 */
> +	inode->i_mode &= ~current_umask();
> +
>  	return 0;
>  }
>  #endif
> diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h
> index f892e54d0fcd..64a05e663a45 100644
> --- a/fs/jfs/jfs_acl.h
> +++ b/fs/jfs/jfs_acl.h
> @@ -17,6 +17,12 @@ int jfs_init_acl(tid_t, struct inode *, struct inode *);
>  static inline int jfs_init_acl(tid_t tid, struct inode *inode,
>  			       struct inode *dir)
>  {
> +	/* usually, the umask is applied by posix_acl_create(), but if
> +	 * ACL support is disabled at compile time, we need to do it
> +	 * here, because posix_acl_create() will never be called
> +	 */
> +	inode->i_mode &= ~current_umask();
> +
>  	return 0;
>  }
>  
> diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h
> index 0e65b3d634d9..54bc9b1061ca 100644
> --- a/include/linux/posix_acl.h
> +++ b/include/linux/posix_acl.h
> @@ -128,6 +128,7 @@ static inline void cache_no_acl(struct inode *inode)
>  static inline int posix_acl_create(struct inode *inode, umode_t *mode,
>  		struct posix_acl **default_acl, struct posix_acl **acl)
>  {
> +	*mode &= ~current_umask();
>  	*default_acl = *acl = NULL;
>  	return 0;
>  }
> -- 
> 2.39.2
>
Max Kellermann Oct. 10, 2023, 1:17 p.m. UTC | #7
On Tue, Oct 10, 2023 at 3:11 PM Jan Kara <jack@suse.cz> wrote:
> Thanks for the updated changelog! But as I'm looking into VFS code isn't
> this already handled by mode_strip_umask() / vfs_prepare_mode() in
> fs/namei.c? Because posix_acl_create() doesn't do anything to 'mode' for
> !IS_POSIXACL() filesystems either. So at least ext2 (where I've checked
> the mount option handling) does seem to have umask properly applied in all
> the cases. But I might be missing something...

I'm not sure either. I was hoping the VFS experts could tell something
about how this API is supposed to be used and whose responsibility it
is to apply the umask. There used to be some confusion in the code, to
the point it was missing completely for O_TMPFILE. I'm still somewhat
confused. Maybe this is a chance to clear this confusion up and then
document it?

I wish there was one central place to apply the umask, and not spread
it around two (or more?) different code locations, depending on
whether there's an ACL. For my taste, that sort of policy is too error
prone for something as sensitive as umasks. After we already had the
O_TMPFILE vulnerability (which was only fixed last year, three
years(!) after I reported it).
Jan Kara Oct. 11, 2023, 10:05 a.m. UTC | #8
On Tue 10-10-23 15:17:17, Max Kellermann wrote:
> On Tue, Oct 10, 2023 at 3:11 PM Jan Kara <jack@suse.cz> wrote:
> > Thanks for the updated changelog! But as I'm looking into VFS code isn't
> > this already handled by mode_strip_umask() / vfs_prepare_mode() in
> > fs/namei.c? Because posix_acl_create() doesn't do anything to 'mode' for
> > !IS_POSIXACL() filesystems either. So at least ext2 (where I've checked
> > the mount option handling) does seem to have umask properly applied in all
> > the cases. But I might be missing something...
> 
> I'm not sure either. I was hoping the VFS experts could tell something
> about how this API is supposed to be used and whose responsibility it
> is to apply the umask. There used to be some confusion in the code, to
> the point it was missing completely for O_TMPFILE. I'm still somewhat
> confused. Maybe this is a chance to clear this confusion up and then
> document it?

So I've checked some more and the kernel doc comments before
mode_strip_umask() and vfs_prepare_mode() make it pretty obvious - all
paths creating new inodes must be calling vfs_prepare_mode(). As a result,
mode_strip_umask() gets called, which handles umask stripping for
filesystems not supporting posix ACLs. For filesystems that do support ACLs,
posix_acl_create() must be called and that handles umask stripping. So your
fix should not be needed. CCed some relevant people for confirmation.

> I wish there was one central place to apply the umask, and not spread
> it around two (or more?) different code locations, depending on
> whether there's an ACL. For my taste, that sort of policy is too error
> prone for something as sensitive as umasks. After we already had the
> O_TMPFILE vulnerability (which was only fixed last year, three
> years(!) after I reported it).

I agree having umask stripping in two places is not great but it's
difficult to avoid with how posix ACLs are implemented and intertwined in
various filesystem implementations. At least the current design made it
quite a bit harder to forget to strip the umask.

								Honza
Max Kellermann Oct. 11, 2023, 10:51 a.m. UTC | #9
On Wed, Oct 11, 2023 at 12:05 PM Jan Kara <jack@suse.cz> wrote:
> So I've checked some more and the kernel doc comments before
> mode_strip_umask() and vfs_prepare_mode() make it pretty obvious - all
> paths creating new inodes must be calling vfs_prepare_mode(). As a result
> mode_strip_umask() which handles umask stripping for filesystems not
> supporting posix ACLs. For filesystems that do support ACLs,
> posix_acl_create() must be call and that handles umask stripping. So your
> fix should not be needed. CCed some relevant people for confirmation.

Thanks, Jan. Do you think the documentation is obvious enough, or
shall I look around and try to improve the documentation? I'm not a FS
expert, so it may be just my fault that it confused me.... I just
analyzed the O_TMPFILE vulnerability four years ago (because it was
reported to me as the maintainer of a userspace software).

Apart from my doubts that this API contract is too error prone, I'm
not quite sure if all filesystems really implement it properly.

For example, overlayfs unconditionally sets SB_POSIXACL, even if the
kernel has no ACL support. Would this ignore the umask? I'm not sure,
overlayfs is a special beast.
Then there's orangefs which allows setting the "acl" mount option (and
thus SB_POSIXACL) even if the kernel has no ACL support. Same for gfs2
and maybe cifs, maybe more, I didn't check them all.

The "mainstream" filesystems like ext4 seem to be implemented
properly, though this is still too fragile for my taste... ext4 has
the SB_POSIXACL code even if there's no kernel ACL support, but it is
not reachable because EXT4_MOUNT_POSIX_ACL cannot be set from
userspace in that case. The code looks suspicious, but is okay in the
end - still not my taste.

I see so much redundant code regarding the "acl" mount option in all
filesystems. I believe the API should be designed in a way that it is
safe-by-default, and shouldn't need very careful considerations in
each and every filesystem, or else all filesystems repeat the same
mistakes until the last one gets fixed.

Max
Jan Kara Oct. 11, 2023, 12:06 p.m. UTC | #10
On Wed 11-10-23 12:51:12, Max Kellermann wrote:
> On Wed, Oct 11, 2023 at 12:05 PM Jan Kara <jack@suse.cz> wrote:
> > So I've checked some more and the kernel doc comments before
> > mode_strip_umask() and vfs_prepare_mode() make it pretty obvious - all
> > paths creating new inodes must be calling vfs_prepare_mode(). As a result
> > mode_strip_umask() which handles umask stripping for filesystems not
> > supporting posix ACLs. For filesystems that do support ACLs,
> > posix_acl_create() must be call and that handles umask stripping. So your
> > fix should not be needed. CCed some relevant people for confirmation.
> 
> Thanks, Jan. Do you think the documentation is obvious enough, or
> shall I look around and try to improve the documentation? I'm not a FS
> expert, so it may be just my fault that it confused me.... I just
> analyzed the O_TMPFILE vulnerability four years ago (because it was
> reported to me as the maintainer of a userspace software).
> 
> Apart from my doubts that this API contract is too error prone, I'm
> not quite sure if all filesystems really implement it properly.
> 
> For example, overlayfs unconditionally sets SB_POSIXACL, even if the
> kernel has no ACL support. Would this ignore the umask? I'm not sure,
> overlayfs is a special beast.
> Then there's orangefs which allows setting the "acl" mount option (and
> thus SB_POSIXACL) even if the kernel has no ACL support. Same for gfs2
> and maybe cifs, maybe more, I didn't check them all.

Indeed, *that* looks like a bug. Good spotting! I'd say posix_acl_create()
defined in include/linux/posix_acl.h for the !CONFIG_FS_POSIX_ACL case
should be stripping mode using umask. Care to send a patch for this?

> The "mainstream" filesystems like ext4 seem to be implemented
> properly, though this is still too fragile for my taste... ext4 has
> the SB_POSIXACL code even if there's no kernel ACL support, but it is
> not reachable because EXT4_MOUNT_POSIX_ACL cannot be set from
> userspace in that case. The code looks suspicious, but is okay in the
> end - still not my taste.
> 
> I see so much redundant code regarding the "acl" mount option in all
> filesystems. I believe the API should be designed in a way that it is
> safe-by-default, and shouldn't need very careful considerations in
> each and every filesystem, or else all filesystems repeat the same
> mistakes until the last one gets fixed.

So I definitely agree that we should handle as many things as possible in
VFS without relying on filesystems to get it right. Thus I agree VFS should
do the right thing even if the filesystem sets SB_POSIXACL when
!CONFIG_FS_POSIX_ACL.

								Honza
Max Kellermann Oct. 11, 2023, 12:18 p.m. UTC | #11
On Wed, Oct 11, 2023 at 2:07 PM Jan Kara <jack@suse.cz> wrote:
> Indeed, *that* looks like a bug. Good spotting! I'd say posix_acl_create()
> defined in include/linux/posix_acl.h for the !CONFIG_FS_POSIX_ACL case
> should be stripping mode using umask. Care to send a patch for this?

You mean like the patch you're commenting on right now? ;-)

But without the other filesystems. I'll resend it with just the
posix_acl.h hunk.
Jan Kara Oct. 11, 2023, 12:27 p.m. UTC | #12
On Wed 11-10-23 14:18:45, Max Kellermann wrote:
> On Wed, Oct 11, 2023 at 2:07 PM Jan Kara <jack@suse.cz> wrote:
> > Indeed, *that* looks like a bug. Good spotting! I'd say posix_acl_create()
> > defined in include/linux/posix_acl.h for the !CONFIG_FS_POSIX_ACL case
> > should be stripping mode using umask. Care to send a patch for this?
> 
> You mean like the patch you're commenting on right now? ;-)

Yeah, OK, that was a bit silly ;) I was too concentrated on the filesystem
bits.

> But without the other filesystems. I'll resend it with just the
> posix_acl.h hunk.

Yup, and a bit massaged changelog... Thanks a lot!

								Honza
Max Kellermann Oct. 11, 2023, 12:27 p.m. UTC | #13
On Wed, Oct 11, 2023 at 2:18 PM Max Kellermann <max.kellermann@ionos.com> wrote:
> But without the other filesystems. I'll resend it with just the
> posix_acl.h hunk.

Thinking again, I don't think this is the proper solution. This may
serve as a workaround so those broken filesystems don't suffer from
this bug, but it's not proper.

posix_acl_create() is only supposed to apply the umask if the inode
supports ACLs; if not, the VFS is supposed to do it. But if the
filesystem pretends to have ACL support but the kernel does not, it's
really a filesystem bug. Hacking the umask code into
posix_acl_create() for that inconsistent case doesn't sound right.

A better workaround would be this patch:
https://patchwork.kernel.org/project/linux-nfs/patch/151603744662.29035.4910161264124875658.stgit@rabbit.intern.cm-ag/
I submitted it more than 5 years ago, it got one positive review, but
was never merged.

This patch enables the VFS's umask code even if the filesystem
pretends to support ACLs. This still doesn't fix the filesystem bug,
but makes VFS's behavior consistent.

Max
Jan Kara Oct. 11, 2023, 1:59 p.m. UTC | #14
On Wed 11-10-23 14:27:49, Max Kellermann wrote:
> On Wed, Oct 11, 2023 at 2:18 PM Max Kellermann <max.kellermann@ionos.com> wrote:
> > But without the other filesystems. I'll resend it with just the
> > posix_acl.h hunk.
> 
> Thinking again, I don't think this is the proper solution. This may
> server as a workaround so those broken filesystems don't suffer from
> this bug, but it's not proper.
> 
> posix_acl_create() is only supposed to appy the umask if the inode
> supports ACLs; if not, the VFS is supposed to do it. But if the
> filesystem pretends to have ACL support but the kernel does not, it's
> really a filesystem bug. Hacking the umask code into
> posix_acl_create() for that inconsistent case doesn't sound right.
> 
> A better workaround would be this patch:
> https://patchwork.kernel.org/project/linux-nfs/patch/151603744662.29035.4910161264124875658.stgit@rabbit.intern.cm-ag/
> I submitted it more than 5 years ago, it got one positive review, but
> was never merged.
> 
> This patch enables the VFS's umask code even if the filesystem
> prerents to support ACLs. This still doesn't fix the filesystem bug,
> but makes VFS's behavior consistent.

OK, that solution works for me as well. I agree it seems a tad bit cleaner.
Christian, which one would you prefer?

								Honza
Christian Brauner Oct. 11, 2023, 3:27 p.m. UTC | #15
On Wed, Oct 11, 2023 at 03:59:22PM +0200, Jan Kara wrote:
> On Wed 11-10-23 14:27:49, Max Kellermann wrote:
> > On Wed, Oct 11, 2023 at 2:18 PM Max Kellermann <max.kellermann@ionos.com> wrote:
> > > But without the other filesystems. I'll resend it with just the
> > > posix_acl.h hunk.
> > 
> > Thinking again, I don't think this is the proper solution. This may
> > server as a workaround so those broken filesystems don't suffer from
> > this bug, but it's not proper.
> > 
> > posix_acl_create() is only supposed to appy the umask if the inode
> > supports ACLs; if not, the VFS is supposed to do it. But if the
> > filesystem pretends to have ACL support but the kernel does not, it's
> > really a filesystem bug. Hacking the umask code into
> > posix_acl_create() for that inconsistent case doesn't sound right.
> > 
> > A better workaround would be this patch:
> > https://patchwork.kernel.org/project/linux-nfs/patch/151603744662.29035.4910161264124875658.stgit@rabbit.intern.cm-ag/
> > I submitted it more than 5 years ago, it got one positive review, but
> > was never merged.
> > 
> > This patch enables the VFS's umask code even if the filesystem
> > prerents to support ACLs. This still doesn't fix the filesystem bug,
> > but makes VFS's behavior consistent.
> 
> OK, that solution works for me as well. I agree it seems a tad bit cleaner.
> Christian, which one would you prefer?

So it always bugged me that POSIX ACLs push umask stripping down into
the individual filesystems but it's hard to get rid of this. And we
tried to improve the situation during the POSIX ACL rework by
introducing vfs_prepare_mode().

Aside from that, the problem had been that filesystems like nfs v4
intentionally raised SB_POSIXACL to prevent umask stripping in the VFS.
IOW, for them SB_POSIXACL was equivalent to "don't apply any umask".

And afaict nfs v4 has its own thing going on how and where umasks are
applied. However, since we now have the following commit in vfs.misc:

commit f61b9bb3f8386a5e59b49bf1310f5b34f47bcef9
Author:     Jeff Layton <jlayton@kernel.org>
AuthorDate: Mon Sep 11 20:25:50 2023 -0400
Commit:     Christian Brauner <brauner@kernel.org>
CommitDate: Thu Sep 21 15:37:47 2023 +0200

    fs: add a new SB_I_NOUMASK flag

    SB_POSIXACL must be set when a filesystem supports POSIX ACLs, but NFSv4
    also sets this flag to prevent the VFS from applying the umask on
    newly-created files. NFSv4 doesn't support POSIX ACLs however, which
    causes confusion when other subsystems try to test for them.

    Add a new SB_I_NOUMASK flag that allows filesystems to opt-in to umask
    stripping without advertising support for POSIX ACLs. Set the new flag
    on NFSv4 instead of SB_POSIXACL.

    Also, move mode_strip_umask to namei.h and convert init_mknod and
    init_mkdir to use it.

    Signed-off-by: Jeff Layton <jlayton@kernel.org>
    Message-Id: <20230911-acl-fix-v3-1-b25315333f6c@kernel.org>
    Signed-off-by: Christian Brauner <brauner@kernel.org>

I think it's possible to pick up the first patch linked above:
   
fix umask on NFS with CONFIG_FS_POSIX_ACL=n doesn't lead to any

and see whether we see any regressions from this.

The second patch I can't easily judge that should go through nfs if at
all.

So proposal/question: should we take the first patch into vfs.misc?
Jan Kara Oct. 11, 2023, 4:29 p.m. UTC | #16
On Wed 11-10-23 17:27:37, Christian Brauner wrote:
> On Wed, Oct 11, 2023 at 03:59:22PM +0200, Jan Kara wrote:
> > On Wed 11-10-23 14:27:49, Max Kellermann wrote:
> > > On Wed, Oct 11, 2023 at 2:18 PM Max Kellermann <max.kellermann@ionos.com> wrote:
> > > > But without the other filesystems. I'll resend it with just the
> > > > posix_acl.h hunk.
> > > 
> > > Thinking again, I don't think this is the proper solution. This may
> > > server as a workaround so those broken filesystems don't suffer from
> > > this bug, but it's not proper.
> > > 
> > > posix_acl_create() is only supposed to appy the umask if the inode
> > > supports ACLs; if not, the VFS is supposed to do it. But if the
> > > filesystem pretends to have ACL support but the kernel does not, it's
> > > really a filesystem bug. Hacking the umask code into
> > > posix_acl_create() for that inconsistent case doesn't sound right.
> > > 
> > > A better workaround would be this patch:
> > > https://patchwork.kernel.org/project/linux-nfs/patch/151603744662.29035.4910161264124875658.stgit@rabbit.intern.cm-ag/
> > > I submitted it more than 5 years ago, it got one positive review, but
> > > was never merged.
> > > 
> > > This patch enables the VFS's umask code even if the filesystem
> > > prerents to support ACLs. This still doesn't fix the filesystem bug,
> > > but makes VFS's behavior consistent.
> > 
> > OK, that solution works for me as well. I agree it seems a tad bit cleaner.
> > Christian, which one would you prefer?
> 
> So it always bugged me that POSIX ACLs push umask stripping down into
> the individual filesystems but it's hard to get rid of this. And we
> tried to improve the situation during the POSIX ACL rework by
> introducing vfs_prepare_umask().
> 
> Aside from that, the problem had been that filesystems like nfs v4
> intentionally raised SB_POSIXACL to prevent umask stripping in the VFS.
> IOW, for them SB_POSIXACL was equivalent to "don't apply any umask".

Ah, what a hack...

> And afaict nfs v4 has it's own thing going on how and where umasks are
> applied. However, since we now have the following commit in vfs.misc:
> 
> commit f61b9bb3f8386a5e59b49bf1310f5b34f47bcef9
> Author:     Jeff Layton <jlayton@kernel.org>
> AuthorDate: Mon Sep 11 20:25:50 2023 -0400
> Commit:     Christian Brauner <brauner@kernel.org>
> CommitDate: Thu Sep 21 15:37:47 2023 +0200
> 
>     fs: add a new SB_I_NOUMASK flag
> 
>     SB_POSIXACL must be set when a filesystem supports POSIX ACLs, but NFSv4
>     also sets this flag to prevent the VFS from applying the umask on
>     newly-created files. NFSv4 doesn't support POSIX ACLs however, which
>     causes confusion when other subsystems try to test for them.
> 
>     Add a new SB_I_NOUMASK flag that allows filesystems to opt-in to umask
>     stripping without advertising support for POSIX ACLs. Set the new flag
>     on NFSv4 instead of SB_POSIXACL.
> 
>     Also, move mode_strip_umask to namei.h and convert init_mknod and
>     init_mkdir to use it.
> 
>     Signed-off-by: Jeff Layton <jlayton@kernel.org>
>     Message-Id: <20230911-acl-fix-v3-1-b25315333f6c@kernel.org>
>     Signed-off-by: Christian Brauner <brauner@kernel.org>
> 
> I think it's possible to pick up the first patch linked above:
>    
> fix umask on NFS with CONFIG_FS_POSIX_ACL=n doesn't lead to any
> 
> and see whether we see any regressions from this.
> 
> The second patch I can't easily judge that should go through nfs if at
> all.
> 
> So proposal/question: should we take the first patch into vfs.misc?

Sounds good to me. I have checked whether any other filesystem tries to
play games similar to NFS's, and it appears not, although overlayfs does
seem to play some games with umasks.

								Honza
Theodore Ts'o Oct. 11, 2023, 5 p.m. UTC | #17
On Wed, Oct 11, 2023 at 05:27:37PM +0200, Christian Brauner wrote:
> Aside from that, the problem had been that filesystems like nfs v4
> intentionally raised SB_POSIXACL to prevent umask stripping in the VFS.
> IOW, for them SB_POSIXACL was equivalent to "don't apply any umask".
> 
> And afaict nfs v4 has it's own thing going on how and where umasks are
> applied. However, since we now have the following commit in vfs.misc:
> 
>     fs: add a new SB_I_NOUMASK flag

To summarize, just to make sure I understand where we're going.  Since
normally (excepting unusual cases like NFS), it's fine to strip the
umask bits twice (once in the VFS, and once in the file system, for
those file systems that are doing it), once we have SB_I_NOUMASK and
NFS starts using it, then the VFS can just unconditionally strip the
umask bits, and then we can gradually clean up the file system umask
handling (which would then be harmlessly duplicative).

Did I get this right?

					- Ted
Jan Kara Oct. 11, 2023, 5:26 p.m. UTC | #18
On Wed 11-10-23 13:00:42, Theodore Ts'o wrote:
> On Wed, Oct 11, 2023 at 05:27:37PM +0200, Christian Brauner wrote:
> > Aside from that, the problem had been that filesystems like nfs v4
> > intentionally raised SB_POSIXACL to prevent umask stripping in the VFS.
> > IOW, for them SB_POSIXACL was equivalent to "don't apply any umask".
> > 
> > And afaict nfs v4 has it's own thing going on how and where umasks are
> > applied. However, since we now have the following commit in vfs.misc:
> > 
> >     fs: add a new SB_I_NOUMASK flag
> 
> To summarize, just to make sure I understand where we're going.  Since
> normally (excepting unusual cases like NFS), it's fine to strip the
> umask bits twice (once in the VFS, and once in the file system, for
> those file systems that are doing it), once we have SB_I_NOUMASK and
> NFS starts using it, then the VFS can just unconditionally strip the
> umask bits, and then we can gradually clean up the file system umask
> handling (which would then be harmlessly duplicative).
> 
> Did I get this right?

I don't think this is accurate. posix_acl_create() needs unmasked 'mode'
because instead of using current_umask() for masking it wants to use
whatever is stored in the ACLs as an umask.

So I still think we need to keep umask handling in both posix_acl_create()
and vfs_prepare_mode(). But filesystem's only obligation would be to call
posix_acl_create() if the inode is IS_POSIXACL. No more caring about when
to apply umask and when not based on config or mount options.

								Honza
Christian Brauner Oct. 12, 2023, 9:22 a.m. UTC | #19
On Wed, Oct 11, 2023 at 06:29:04PM +0200, Jan Kara wrote:
> On Wed 11-10-23 17:27:37, Christian Brauner wrote:
> > On Wed, Oct 11, 2023 at 03:59:22PM +0200, Jan Kara wrote:
> > > On Wed 11-10-23 14:27:49, Max Kellermann wrote:
> > > > On Wed, Oct 11, 2023 at 2:18 PM Max Kellermann <max.kellermann@ionos.com> wrote:
> > > > > But without the other filesystems. I'll resend it with just the
> > > > > posix_acl.h hunk.
> > > > 
> > > > Thinking again, I don't think this is the proper solution. This may
> > > > server as a workaround so those broken filesystems don't suffer from
> > > > this bug, but it's not proper.
> > > > 
> > > > posix_acl_create() is only supposed to appy the umask if the inode
> > > > supports ACLs; if not, the VFS is supposed to do it. But if the
> > > > filesystem pretends to have ACL support but the kernel does not, it's
> > > > really a filesystem bug. Hacking the umask code into
> > > > posix_acl_create() for that inconsistent case doesn't sound right.
> > > > 
> > > > A better workaround would be this patch:
> > > > https://patchwork.kernel.org/project/linux-nfs/patch/151603744662.29035.4910161264124875658.stgit@rabbit.intern.cm-ag/
> > > > I submitted it more than 5 years ago, it got one positive review, but
> > > > was never merged.
> > > > 
> > > > This patch enables the VFS's umask code even if the filesystem
> > > > prerents to support ACLs. This still doesn't fix the filesystem bug,
> > > > but makes VFS's behavior consistent.
> > > 
> > > OK, that solution works for me as well. I agree it seems a tad bit cleaner.
> > > Christian, which one would you prefer?
> > 
> > So it always bugged me that POSIX ACLs push umask stripping down into
> > the individual filesystems but it's hard to get rid of this. And we
> > tried to improve the situation during the POSIX ACL rework by
> > introducing vfs_prepare_umask().
> > 
> > Aside from that, the problem had been that filesystems like nfs v4
> > intentionally raised SB_POSIXACL to prevent umask stripping in the VFS.
> > IOW, for them SB_POSIXACL was equivalent to "don't apply any umask".
> 
> Ah, what a hack...
> 
> > And afaict nfs v4 has it's own thing going on how and where umasks are
> > applied. However, since we now have the following commit in vfs.misc:
> > 
> > commit f61b9bb3f8386a5e59b49bf1310f5b34f47bcef9
> > Author:     Jeff Layton <jlayton@kernel.org>
> > AuthorDate: Mon Sep 11 20:25:50 2023 -0400
> > Commit:     Christian Brauner <brauner@kernel.org>
> > CommitDate: Thu Sep 21 15:37:47 2023 +0200
> > 
> >     fs: add a new SB_I_NOUMASK flag
> > 
> >     SB_POSIXACL must be set when a filesystem supports POSIX ACLs, but NFSv4
> >     also sets this flag to prevent the VFS from applying the umask on
> >     newly-created files. NFSv4 doesn't support POSIX ACLs however, which
> >     causes confusion when other subsystems try to test for them.
> > 
> >     Add a new SB_I_NOUMASK flag that allows filesystems to opt-in to umask
> >     stripping without advertising support for POSIX ACLs. Set the new flag
> >     on NFSv4 instead of SB_POSIXACL.
> > 
> >     Also, move mode_strip_umask to namei.h and convert init_mknod and
> >     init_mkdir to use it.
> > 
> >     Signed-off-by: Jeff Layton <jlayton@kernel.org>
> >     Message-Id: <20230911-acl-fix-v3-1-b25315333f6c@kernel.org>
> >     Signed-off-by: Christian Brauner <brauner@kernel.org>
> > 
> > I think it's possible to pick up the first patch linked above:
> >    
> > fix umask on NFS with CONFIG_FS_POSIX_ACL=n doesn't lead to any
> > 
> > and see whether we see any regressions from this.
> > 
> > The second patch I can't easily judge that should go through nfs if at
> > all.
> > 
> > So proposal/question: should we take the first patch into vfs.misc?
> 
> Sounds good to me. I have checked whether some other filesystem does not
> try to play similar games as NFS and it appears not although overlayfs does
> seem to play some games with umasks.

I think that overlayfs sets SB_POSIXACL unconditionally to ensure that
the upper filesystem can decide where the umask needs to be stripped. If
the upper filesystem doesn't have SB_POSIXACL then the umask will be
stripped directly in e.g., vfs_create(), and vfs_tmpfile(). If it does
then it will be done in the upper filesystems.

So with the patch I linked above that we have in vfs.misc we should be
able to change overlayfs to behave similarly to NFS:

diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 9f43f0d303ad..361189b676b0 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -1489,8 +1489,16 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc)
        sb->s_xattr = ofs->config.userxattr ? ovl_user_xattr_handlers :
                ovl_trusted_xattr_handlers;
        sb->s_fs_info = ofs;
+#ifdef CONFIG_FS_POSIX_ACL
        sb->s_flags |= SB_POSIXACL;
+#endif
        sb->s_iflags |= SB_I_SKIP_SYNC | SB_I_IMA_UNVERIFIABLE_SIGNATURE;
+       /*
+        * Ensure that umask handling is done by the filesystems used
+        * for the upper layer instead of overlayfs as that would
+        * lead to unexpected results.
+        */
+       sb->s_iflags |= SB_I_NOUMASK;

        err = -ENOMEM;
        root_dentry = ovl_get_root(sb, ctx->upper.dentry, oe);

Which means that umask handling will be done by the upper filesystems
just as is done right now and overlayfs can stop advertising SB_POSIXACL
support on a kernel that doesn't have support for it compiled in.

How does that sound?
Jan Kara Oct. 12, 2023, 9:41 a.m. UTC | #20
On Thu 12-10-23 11:22:29, Christian Brauner wrote:
> On Wed, Oct 11, 2023 at 06:29:04PM +0200, Jan Kara wrote:
> > On Wed 11-10-23 17:27:37, Christian Brauner wrote:
> > > On Wed, Oct 11, 2023 at 03:59:22PM +0200, Jan Kara wrote:
> > > > On Wed 11-10-23 14:27:49, Max Kellermann wrote:
> > > > > On Wed, Oct 11, 2023 at 2:18 PM Max Kellermann <max.kellermann@ionos.com> wrote:
> > > > > > But without the other filesystems. I'll resend it with just the
> > > > > > posix_acl.h hunk.
> > > > > 
> > > > > Thinking again, I don't think this is the proper solution. This may
> > > > > server as a workaround so those broken filesystems don't suffer from
> > > > > this bug, but it's not proper.
> > > > > 
> > > > > posix_acl_create() is only supposed to appy the umask if the inode
> > > > > supports ACLs; if not, the VFS is supposed to do it. But if the
> > > > > filesystem pretends to have ACL support but the kernel does not, it's
> > > > > really a filesystem bug. Hacking the umask code into
> > > > > posix_acl_create() for that inconsistent case doesn't sound right.
> > > > > 
> > > > > A better workaround would be this patch:
> > > > > https://patchwork.kernel.org/project/linux-nfs/patch/151603744662.29035.4910161264124875658.stgit@rabbit.intern.cm-ag/
> > > > > I submitted it more than 5 years ago, it got one positive review, but
> > > > > was never merged.
> > > > > 
> > > > > This patch enables the VFS's umask code even if the filesystem
> > > > > prerents to support ACLs. This still doesn't fix the filesystem bug,
> > > > > but makes VFS's behavior consistent.
> > > > 
> > > > OK, that solution works for me as well. I agree it seems a tad bit cleaner.
> > > > Christian, which one would you prefer?
> > > 
> > > So it always bugged me that POSIX ACLs push umask stripping down into
> > > the individual filesystems but it's hard to get rid of this. And we
> > > tried to improve the situation during the POSIX ACL rework by
> > > introducing vfs_prepare_umask().
> > > 
> > > Aside from that, the problem had been that filesystems like nfs v4
> > > intentionally raised SB_POSIXACL to prevent umask stripping in the VFS.
> > > IOW, for them SB_POSIXACL was equivalent to "don't apply any umask".
> > 
> > Ah, what a hack...
> > 
> > > And afaict nfs v4 has it's own thing going on how and where umasks are
> > > applied. However, since we now have the following commit in vfs.misc:
> > > 
> > > commit f61b9bb3f8386a5e59b49bf1310f5b34f47bcef9
> > > Author:     Jeff Layton <jlayton@kernel.org>
> > > AuthorDate: Mon Sep 11 20:25:50 2023 -0400
> > > Commit:     Christian Brauner <brauner@kernel.org>
> > > CommitDate: Thu Sep 21 15:37:47 2023 +0200
> > > 
> > >     fs: add a new SB_I_NOUMASK flag
> > > 
> > >     SB_POSIXACL must be set when a filesystem supports POSIX ACLs, but NFSv4
> > >     also sets this flag to prevent the VFS from applying the umask on
> > >     newly-created files. NFSv4 doesn't support POSIX ACLs however, which
> > >     causes confusion when other subsystems try to test for them.
> > > 
> > >     Add a new SB_I_NOUMASK flag that allows filesystems to opt-in to umask
> > >     stripping without advertising support for POSIX ACLs. Set the new flag
> > >     on NFSv4 instead of SB_POSIXACL.
> > > 
> > >     Also, move mode_strip_umask to namei.h and convert init_mknod and
> > >     init_mkdir to use it.
> > > 
> > >     Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > >     Message-Id: <20230911-acl-fix-v3-1-b25315333f6c@kernel.org>
> > >     Signed-off-by: Christian Brauner <brauner@kernel.org>
> > > 
> > > I think it's possible to pick up the first patch linked above:
> > >    
> > > fix umask on NFS with CONFIG_FS_POSIX_ACL=n doesn't lead to any
> > > 
> > > and see whether we see any regressions from this.
> > > 
> > > The second patch I can't easily judge that should go through nfs if at
> > > all.
> > > 
> > > So proposal/question: should we take the first patch into vfs.misc?
> > 
> > Sounds good to me. I have checked whether some other filesystem does not
> > try to play similar games as NFS and it appears not although overlayfs does
> > seem to play some games with umasks.
> 
> I think that overlayfs sets SB_POSIXACL unconditionally to ensure that
> the upper filesystem can decide where the umask needs to be stripped. If
> the upper filesystem doesn't have SB_POSIXACL then the umask will be
> stripped directly in e.g., vfs_create(), and vfs_tmpfile(). If it does
> then it will be done in the upper filesystems.
> 
> So with the patch I linked above that we have in vfs.misc we should be
> able to  change overlayfs to behave similar to NFS:

Yep, I was thinking that this might be what overlayfs wants. But I know
far too little about overlayfs to be sure ;) That's why I've CCed Amir in my
previous email...

								Honza

> 
> diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
> index 9f43f0d303ad..361189b676b0 100644
> --- a/fs/overlayfs/super.c
> +++ b/fs/overlayfs/super.c
> @@ -1489,8 +1489,16 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc)
>         sb->s_xattr = ofs->config.userxattr ? ovl_user_xattr_handlers :
>                 ovl_trusted_xattr_handlers;
>         sb->s_fs_info = ofs;
> +#ifdef CONFIG_FS_POSIX_ACL
>         sb->s_flags |= SB_POSIXACL;
> +#endif
>         sb->s_iflags |= SB_I_SKIP_SYNC | SB_I_IMA_UNVERIFIABLE_SIGNATURE;
> +       /*
> +        * Ensure that umask handling is done by the filesystems used
> +        * for the the upper layer instead of overlayfs as that would
> +        * lead to unexpected results.
> +        */
> +       sb->s_iflags |= SB_I_NOUMASK;
> 
>         err = -ENOMEM;
>         root_dentry = ovl_get_root(sb, ctx->upper.dentry, oe);
> 
> Which means that umask handling will be done by the upper filesystems
> just as is done right now and overlayfs can stop advertising SB_POSIXACL
> support on a kernel that doesn't have support for it compiled in.
> 
> How does that sound?
Theodore Ts'o Oct. 12, 2023, 2:29 p.m. UTC | #21
On Wed, Oct 11, 2023 at 07:26:06PM +0200, Jan Kara wrote:
> I don't think this is accurate. posix_acl_create() needs unmasked 'mode'
> because instead of using current_umask() for masking it wants to use
> whatever is stored in the ACLs as an umask.
> 
> So I still think we need to keep umask handling in both posix_acl_create()
> and vfs_prepare_mode(). But filesystem's only obligation would be to call
> posix_acl_create() if the inode is IS_POSIXACL. No more caring about when
> to apply umask and when not based on config or mount options.

Ah, right, thanks for the clarification.  I *think* the following
patch in the ext4 dev branch (not yet in Linus's tree, but it should
be in linux-next) should be harmless, though, right?  And once we get
the changes in vfs_prepare_mode() we can revert it in ext4 --- or do
folks think I should just drop it from the ext4 dev branch now?

Thanks,

						- Ted

commit 484fd6c1de13b336806a967908a927cc0356e312
Author: Max Kellermann <max.kellermann@ionos.com>
Date:   Tue Sep 19 10:18:23 2023 +0200

    ext4: apply umask if ACL support is disabled
    
    The function ext4_init_acl() calls posix_acl_create() which is
    responsible for applying the umask.  But without
    CONFIG_EXT4_FS_POSIX_ACL, ext4_init_acl() is an empty inline function,
    and nobody applies the umask.
    
    This fixes a bug which causes the umask to be ignored with O_TMPFILE
    on ext4:
    
     https://github.com/MusicPlayerDaemon/MPD/issues/558
     https://bugs.gentoo.org/show_bug.cgi?id=686142#c3
     https://bugzilla.kernel.org/show_bug.cgi?id=203625
    
    Reviewed-by: "J. Bruce Fields" <bfields@redhat.com>
    Cc: stable@vger.kernel.org
    Signed-off-by: Max Kellermann <max.kellermann@ionos.com>
    Link: https://lore.kernel.org/r/20230919081824.1096619-1-max.kellermann@ionos.com
    Signed-off-by: Theodore Ts'o <tytso@mit.edu>

diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h
index 0c5a79c3b5d4..ef4c19e5f570 100644
--- a/fs/ext4/acl.h
+++ b/fs/ext4/acl.h
@@ -68,6 +68,11 @@ extern int ext4_init_acl(handle_t *, struct inode *, struct inode *);
 static inline int
 ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
 {
+	/* usually, the umask is applied by posix_acl_create(), but if
+	   ext4 ACL support is disabled at compile time, we need to do
+	   it here, because posix_acl_create() will never be called */
+	inode->i_mode &= ~current_umask();
+
 	return 0;
 }
 #endif  /* CONFIG_EXT4_FS_POSIX_ACL */
Jan Kara Oct. 12, 2023, 2:42 p.m. UTC | #22
On Thu 12-10-23 10:29:18, Theodore Ts'o wrote:
> On Wed, Oct 11, 2023 at 07:26:06PM +0200, Jan Kara wrote:
> > I don't think this is accurate. posix_acl_create() needs unmasked 'mode'
> > because instead of using current_umask() for masking it wants to use
> > whatever is stored in the ACLs as an umask.
> > 
> > So I still think we need to keep umask handling in both posix_acl_create()
> > and vfs_prepare_mode(). But filesystem's only obligation would be to call
> > posix_acl_create() if the inode is IS_POSIXACL. No more caring about when
> > to apply umask and when not based on config or mount options.
> 
> Ah, right, thanks for the clarification.  I *think* the following
> patch in the ext4 dev branch (not yet in Linus's tree, but it should
> be in linux-next) should be harmless, though, right?  And once we get
> the changes in vfs_prepare_mode() we can revert in ext4 --- or do
> folks I think I should just drop it from the ext4 dev branch now?

It definitely does no harm. As you say, you can revert it once the VFS
changes land if you want.

								Honza

> commit 484fd6c1de13b336806a967908a927cc0356e312
> Author: Max Kellermann <max.kellermann@ionos.com>
> Date:   Tue Sep 19 10:18:23 2023 +0200
> 
>     ext4: apply umask if ACL support is disabled
>     
>     The function ext4_init_acl() calls posix_acl_create() which is
>     responsible for applying the umask.  But without
>     CONFIG_EXT4_FS_POSIX_ACL, ext4_init_acl() is an empty inline function,
>     and nobody applies the umask.
>     
>     This fixes a bug which causes the umask to be ignored with O_TMPFILE
>     on ext4:
>     
>      https://github.com/MusicPlayerDaemon/MPD/issues/558
>      https://bugs.gentoo.org/show_bug.cgi?id=686142#c3
>      https://bugzilla.kernel.org/show_bug.cgi?id=203625
>     
>     Reviewed-by: "J. Bruce Fields" <bfields@redhat.com>
>     Cc: stable@vger.kernel.org
>     Signed-off-by: Max Kellermann <max.kellermann@ionos.com>
>     Link: https://lore.kernel.org/r/20230919081824.1096619-1-max.kellermann@ionos.com
>     Signed-off-by: Theodore Ts'o <tytso@mit.edu>
> 
> diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h
> index 0c5a79c3b5d4..ef4c19e5f570 100644
> --- a/fs/ext4/acl.h
> +++ b/fs/ext4/acl.h
> @@ -68,6 +68,11 @@ extern int ext4_init_acl(handle_t *, struct inode *, struct inode *);
>  static inline int
>  ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
>  {
> +	/* usually, the umask is applied by posix_acl_create(), but if
> +	   ext4 ACL support is disabled at compile time, we need to do
> +	   it here, because posix_acl_create() will never be called */
> +	inode->i_mode &= ~current_umask();
> +
>  	return 0;
>  }
>  #endif  /* CONFIG_EXT4_FS_POSIX_ACL */
Michael Forney March 13, 2024, 8:40 p.m. UTC | #23
Jan Kara <jack@suse.cz> wrote:
> On Thu 12-10-23 10:29:18, Theodore Ts'o wrote:
> > On Wed, Oct 11, 2023 at 07:26:06PM +0200, Jan Kara wrote:
> > > I don't think this is accurate. posix_acl_create() needs unmasked 'mode'
> > > because instead of using current_umask() for masking it wants to use
> > > whatever is stored in the ACLs as an umask.
> > > 
> > > So I still think we need to keep umask handling in both posix_acl_create()
> > > and vfs_prepare_mode(). But filesystem's only obligation would be to call
> > > posix_acl_create() if the inode is IS_POSIXACL. No more caring about when
> > > to apply umask and when not based on config or mount options.
> > 
> > Ah, right, thanks for the clarification.  I *think* the following
> > patch in the ext4 dev branch (not yet in Linus's tree, but it should
> > be in linux-next) should be harmless, though, right?  And once we get
> > the changes in vfs_prepare_mode() we can revert in ext4 --- or do
> > folks I think I should just drop it from the ext4 dev branch now?
> 
> It definitely does no harm. As you say, you can revert it once the VFS
> changes land if you want.

I've been debugging why flatpak was always considering its database
corrupted, and found this commit to be the source of the issue.

$ ostree --repo=repo --mode=bare-user-only init
$ mkdir tree && umask 0 && ln -s target tree/symlink && umask 022
$ ostree --repo=repo commit --branch=foo tree/
c508e0564267b376661889b9016f8438bd6d39412078838f78856383fdd8ac2f
$ ostree --repo=repo fsck
Validating refs...
Validating refs in collections...
Enumerating commits...
Verifying content integrity of 1 commit objects...
fsck objects (1/4) [===          ]  25%
error: In commits c508e0564267b376661889b9016f8438bd6d39412078838f78856383fdd8ac2f: fsck content object a6b40a5400ed082fbe067d2c8397aab54046a089768651c392a36db46d24c1cd: Corrupted file object; checksum expected='a6b40a5400ed082fbe067d2c8397aab54046a089768651c392a36db46d24c1cd'
actual='6bdc88f9722f96dbd51735e381f8a1b0e01363e1d7ee2edbb474c091f83c3987'
$

Turns out that symlinks are inheriting umask on my system (which
has CONFIG_EXT4_FS_POSIX_ACL=n):

$ umask 022
$ ln -s target symlink
$ ls -l symlink
lrwxr-xr-x    1 michael  michael           6 Mar 13 13:28 symlink -> target
$

Looking at the referenced functions, posix_acl_create() returns
early before applying umask for symlinks, but ext4_init_acl() now
applies the umask unconditionally.

After reverting this commit, it works correctly. I am also unable
to reproduce the mentioned issue with O_TMPFILE after reverting the
commit. It seems that the bug was fixed properly in ac6800e279a2
('fs: Add missing umask strip in vfs_tmpfile'), and all branches
that have this ext4_init_acl patch already had ac6800e279a2 backported.

So I think this patch should be reverted, since the bug was already
fixed and it breaks symlink modes. If not, it should at least be
changed so that it does not apply the umask to symlinks.

> > commit 484fd6c1de13b336806a967908a927cc0356e312
> > Author: Max Kellermann <max.kellermann@ionos.com>
> > Date:   Tue Sep 19 10:18:23 2023 +0200
> > 
> >     ext4: apply umask if ACL support is disabled
> >     
> >     The function ext4_init_acl() calls posix_acl_create() which is
> >     responsible for applying the umask.  But without
> >     CONFIG_EXT4_FS_POSIX_ACL, ext4_init_acl() is an empty inline function,
> >     and nobody applies the umask.
> >     
> >     This fixes a bug which causes the umask to be ignored with O_TMPFILE
> >     on ext4:
> >     
> >      https://github.com/MusicPlayerDaemon/MPD/issues/558
> >      https://bugs.gentoo.org/show_bug.cgi?id=686142#c3
> >      https://bugzilla.kernel.org/show_bug.cgi?id=203625
> >     
> >     Reviewed-by: "J. Bruce Fields" <bfields@redhat.com>
> >     Cc: stable@vger.kernel.org
> >     Signed-off-by: Max Kellermann <max.kellermann@ionos.com>
> >     Link: https://lore.kernel.org/r/20230919081824.1096619-1-max.kellermann@ionos.com
> >     Signed-off-by: Theodore Ts'o <tytso@mit.edu>
> > 
> > diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h
> > index 0c5a79c3b5d4..ef4c19e5f570 100644
> > --- a/fs/ext4/acl.h
> > +++ b/fs/ext4/acl.h
> > @@ -68,6 +68,11 @@ extern int ext4_init_acl(handle_t *, struct inode *, struct inode *);
> >  static inline int
> >  ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
> >  {
> > +	/* usually, the umask is applied by posix_acl_create(), but if
> > +	   ext4 ACL support is disabled at compile time, we need to do
> > +	   it here, because posix_acl_create() will never be called */
> > +	inode->i_mode &= ~current_umask();
> > +
> >  	return 0;
> >  }
> >  #endif  /* CONFIG_EXT4_FS_POSIX_ACL */
Max Kellermann March 14, 2024, 1:08 p.m. UTC | #24
On Wed, Mar 13, 2024 at 9:39 PM Michael Forney <mforney@mforney.org> wrote:
> Turns out that symlinks are inheriting umask on my system (which
> has CONFIG_EXT4_FS_POSIX_ACL=n):
>
> $ umask 022
> $ ln -s target symlink
> $ ls -l symlink
> lrwxr-xr-x    1 michael  michael           6 Mar 13 13:28 symlink -> target
> $
>
> Looking at the referenced functions, posix_acl_create() returns
> early before applying umask for symlinks, but ext4_init_acl() now
> applies the umask unconditionally.

Indeed, I forgot to exclude symlinks from this - sorry for the breakage.

> After reverting this commit, it works correctly. I am also unable
> to reproduce the mentioned issue with O_TMPFILE after reverting the
> commit. It seems that the bug was fixed properly in ac6800e279a2
> ('fs: Add missing umask strip in vfs_tmpfile'), and all branches
> that have this ext4_init_acl patch already had ac6800e279a2 backported.

I can post a patch that adds the missing check or a revert - what do
the FS maintainers prefer?

(There was a bug with O_TMPFILE ignoring umasks years ago - I first
posted the patch in 2018 or so - but by the time my patch actually got
merged, the bug had already been fixed somewhere else IIRC.)

Max
Christian Brauner March 15, 2024, 1:52 p.m. UTC | #25
On Thu, Mar 14, 2024 at 02:08:04PM +0100, Max Kellermann wrote:
> On Wed, Mar 13, 2024 at 9:39 PM Michael Forney <mforney@mforney.org> wrote:
> > Turns out that symlinks are inheriting umask on my system (which
> > has CONFIG_EXT4_FS_POSIX_ACL=n):
> >
> > $ umask 022
> > $ ln -s target symlink
> > $ ls -l symlink
> > lrwxr-xr-x    1 michael  michael           6 Mar 13 13:28 symlink -> target
> > $
> >
> > Looking at the referenced functions, posix_acl_create() returns
> > early before applying umask for symlinks, but ext4_init_acl() now
> > applies the umask unconditionally.
> 
> Indeed, I forgot to exclude symlinks from this - sorry for the breakage.
> 
> > After reverting this commit, it works correctly. I am also unable
> > to reproduce the mentioned issue with O_TMPFILE after reverting the
> > commit. It seems that the bug was fixed properly in ac6800e279a2
> > ('fs: Add missing umask strip in vfs_tmpfile'), and all branches
> > that have this ext4_init_acl patch already had ac6800e279a2 backported.
> 
> I can post a patch that adds the missing check or a revert - what do
> the FS maintainers prefer?

If it works correctly with a revert we should remove the code rather
than adding more code to handle a special case.

> 
> (There was a bug with O_TMPFILE ignoring umasks years ago - I first
> posted the patch in 2018 or so - but by the time my patch actually got
> merged, the bug had already been fixed somewhere else IIRC.)

Yeah, we fixed it a while ago and then I added generic VFS-level umask
handling, but POSIX ACLs are hurting us because they're a massive layering
violation on that front.
diff mbox series

Patch

diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 51c7f2b14f6f..e7e2f264acf4 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -1194,6 +1194,7 @@  static inline void ceph_forget_all_cached_acls(struct inode *inode)
 static inline int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
 				     struct ceph_acl_sec_ctx *as_ctx)
 {
+	*mode &= ~current_umask();
 	return 0;
 }
 static inline void ceph_init_inode_acls(struct inode *inode,
diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h
index 4a8443a2b8ec..694af789c614 100644
--- a/fs/ext2/acl.h
+++ b/fs/ext2/acl.h
@@ -67,6 +67,7 @@  extern int ext2_init_acl (struct inode *, struct inode *);
 
 static inline int ext2_init_acl (struct inode *inode, struct inode *dir)
 {
+	inode->i_mode &= ~current_umask();
 	return 0;
 }
 #endif
diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h
index f892e54d0fcd..10791e97a46f 100644
--- a/fs/jfs/jfs_acl.h
+++ b/fs/jfs/jfs_acl.h
@@ -17,6 +17,7 @@  int jfs_init_acl(tid_t, struct inode *, struct inode *);
 static inline int jfs_init_acl(tid_t tid, struct inode *inode,
 			       struct inode *dir)
 {
+	inode->i_mode &= ~current_umask();
 	return 0;
 }
 
diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h
index 0e65b3d634d9..54bc9b1061ca 100644
--- a/include/linux/posix_acl.h
+++ b/include/linux/posix_acl.h
@@ -128,6 +128,7 @@  static inline void cache_no_acl(struct inode *inode)
 static inline int posix_acl_create(struct inode *inode, umode_t *mode,
 		struct posix_acl **default_acl, struct posix_acl **acl)
 {
+	*mode &= ~current_umask();
 	*default_acl = *acl = NULL;
 	return 0;
 }