Message ID: 20220927120857.639461-1-max.kellermann@ionos.com
State:      New
Series:     fs/ceph/super: add mount options "snapdir{mode,uid,gid}"
Hi Max,

Sorry for the late reply, I am just back from a long vacation.

On 27/09/2022 20:08, Max Kellermann wrote:
> By default, the ".snap" directory inherits the parent's permissions
> and ownership, which allows all users to create new cephfs snapshots
> in arbitrary directories they have write access on.
>
> In some environments, giving everybody this capability is not
> desirable, but there is currently no way to disallow only some users
> to create snapshots. It is only possible to revoke the permission to
> the whole client (i.e. all users on the computer which mounts the
> cephfs).
>
> This patch allows overriding the permissions and ownership of all
> virtual ".snap" directories in a cephfs mount, which allows
> restricting (read and write) access to snapshots.
>
> For example, the mount options:
>
>   snapdirmode=0751,snapdiruid=0,snapdirgid=4
>
> ... allow only user "root" to create or delete snapshots; group "adm"
> (gid=4) is allowed to get a list of snapshots. All others are allowed
> to read the contents of existing snapshots (if they know the name).

I don't think the client side is a good place to implement this.
Should this be a feature in cephx?

With this, the same directories will behave differently in different
mounts. Isn't that odd?

-- Xiubo
On Sun, Oct 9, 2022 at 8:23 AM Xiubo Li <xiubli@redhat.com> wrote:
> I don't think the client side is a good place to implement this.
> Should this be a feature in cephx?

What's cephx? "git grep cephx" didn't reveal anything that looked
useful to me.

> With this, the same directories will behave differently in different
> mounts. Isn't that odd?

Just like different mounts can currently have different snapdir names.
That's just as odd, and I tried to imitate what's already there.

I don't have an opinion on how this should be implemented; all I want
is to restrict who gets access to cephfs snapshots. Please explain how
you want it, and I'll send an amended patch.
On Sun, Oct 9, 2022 at 10:43 AM Xiubo Li <xiubli@redhat.com> wrote:
> I mean CEPHFS CLIENT CAPABILITIES [1].

I know that, but that's not suitable for me. This is client-specific,
not user (uid/gid) specific.

In my use case, a server can run unprivileged user processes which
should not be able to create snapshots of their own home directory,
and ideally they should not even be able to traverse into the ".snap"
directory and access the snapshots created of their home directory.
Other (non-superuser) system processes, however, should be able to
manage snapshots. It should be possible to bind-mount snapshots into
the user's mount namespace.

All of that is possible with my patch, but impossible with your
suggestion. The client-specific approach is all-or-nothing (unless I
am missing something vital).

> The snapdir name is a different case.

But this is only about the snapdir. The snapdir does not exist on the
server; it is synthesized on the client (in the Linux kernel cephfs
code).

> But your current approach will introduce issues when a UID/GID is
> reused after a user/group is deleted?

The UID I would specify is one which exists on the client, for a
dedicated system user whose purpose is to manage cephfs snapshots of
all users. The UID is created when the machine is installed, and is
never deleted.

> Maybe the proper approach is the posix acl. Then by default the .snap
> dir will inherit the permission from its parent and you can change it
> as you wish. This permission could be spread to all the other clients
> too?

No, that would be impractical and unreliable.

Impractical because it would require me to walk the whole filesystem
tree and let the kernel synthesize the snapdir inode for every
directory and change its ACL; impractical because walking millions of
directories takes longer than I am willing to wait.

Unreliable because there would be race problems when another client
(or even the local client) creates a new directory. Until my local
"snapdir ACL daemon" learns about the existence of the new directory
and is able to update its ACL, the user can already have messed with
it.

Neither of those is a problem with my patch.
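For context, CephFS snapshots are managed with plain mkdir/rmdir inside
the synthetic ".snap" directory, which is why the ownership and mode of
that synthesized directory decide who may create and see snapshots. A
minimal illustration of today's default behaviour (path and snapshot
name are hypothetical):

  # ".snap" inherits the parent's mode/ownership, so any user with
  # write access to a directory can snapshot it:
  $ mkdir /home/alice/.snap/before-upgrade   # take a snapshot
  $ ls /home/alice/.snap                     # list existing snapshots
  $ rmdir /home/alice/.snap/before-upgrade   # delete the snapshot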
On 09/10/2022 18:27, Max Kellermann wrote:
> On Sun, Oct 9, 2022 at 10:43 AM Xiubo Li <xiubli@redhat.com> wrote:
>> I mean CEPHFS CLIENT CAPABILITIES [1].
> I know that, but that's not suitable for me. This is client-specific,
> not user (uid/gid) specific.
>
> In my use case, a server can run unprivileged user processes which
> should not be able to create snapshots of their own home directory,
> and ideally they should not even be able to traverse into the ".snap"
> directory and access the snapshots created of their home directory.
> Other (non-superuser) system processes, however, should be able to
> manage snapshots. It should be possible to bind-mount snapshots into
> the user's mount namespace.
>
> All of that is possible with my patch, but impossible with your
> suggestion. The client-specific approach is all-or-nothing (unless I
> am missing something vital).
>
>> The snapdir name is a different case.
> But this is only about the snapdir. The snapdir does not exist on the
> server; it is synthesized on the client (in the Linux kernel cephfs
> code).

This could be applied to its parent dir instead, as one piece of
metadata on the MDS side, and on the client side it would be
transferred to the snapdir's metadata, just like the snapshots
themselves. But just ignore this approach.

>> But your current approach will introduce issues when a UID/GID is
>> reused after a user/group is deleted?
> The UID I would specify is one which exists on the client, for a
> dedicated system user whose purpose is to manage cephfs snapshots of
> all users. The UID is created when the machine is installed, and is
> never deleted.

This is an ideal use case IMO.

I googled about UID/GID reuse issues and found that someone has hit a
similar issue in their use case.

>> Maybe the proper approach is the posix acl. Then by default the
>> .snap dir will inherit the permission from its parent and you can
>> change it as you wish. This permission could be spread to all the
>> other clients too?
> No, that would be impractical and unreliable.
> Impractical because it would require me to walk the whole filesystem
> tree and let the kernel synthesize the snapdir inode for every
> directory and change its ACL;

No, it doesn't have to. This could work just like the snaprealm
hierarchy in kceph: only the topmost directory needs to record the
ACL, and all descendants will point to and use it unless they have
their own ACLs.

> impractical because walking millions
> of directories takes longer than I am willing to wait.
> Unreliable because there would be race problems when another client
> (or even the local client) creates a new directory. Until my local
> "snapdir ACL daemon" learns about the existence of the new directory
> and is able to update its ACL, the user can already have messed with
> it.

For the multiple-clients case I think the cephfs capabilities [3]
could guarantee the consistency of this. As for the single-client
case: if someone else can change or mess up the ACL in the window
between creating the directory and the user updating its ACL, don't
the existing ACLs have the same issue?

[3] https://docs.ceph.com/en/quincy/cephfs/capabilities/

> Neither of those is a problem with my patch.

Jeff,

Any idea?

Thanks!

- Xiubo
On Mon, Oct 10, 2022 at 4:03 AM Xiubo Li <xiubli@redhat.com> wrote:
> No, it doesn't have to. This could work just like the snaprealm
> hierarchy in kceph: only the topmost directory needs to record the
> ACL, and all descendants will point to and use it unless they have
> their own ACLs.

Not knowing much about Ceph internals, I don't quite understand this
idea. Until somebody implements it, I'll keep my patch in our Linux
kernel fork. My patch is a very simple and robust fix for our problem
(it's already in production use). It's okay for me if it doesn't get
merged into mainline, but at least I wanted to share it.

> For the multiple-clients case I think the cephfs capabilities [3]
> could guarantee the consistency of this.

I don't understand: capabilities are client-specific, not
user-specific. My problem is a user-specific one.

> As for the single-client case: if someone else can change or mess up
> the ACL in the window between creating the directory and the user
> updating its ACL, don't the existing ACLs have the same issue?

You mean ACLs for "real" files/directories? Sure, some care needs to
be taken, e.g. create directories with mode 700 and chmod only after
setting the ACL, and use O_TMPFILE for files and link them into a
directory only after updating the ACL.

The difference with the snapdir is that the snapdir doesn't actually
exist anywhere; it is synthesized by the client only on first access,
may be discarded at any time by the shrinker, and the next access
creates a new one with default permissions. All those snapdir inodes
are local to the client, and each client has its own private set of
permissions for them. Even if you took care to update the snapdir
permissions after creating a directory, that would only be visible on
that one client, and could be reverted at any arbitrary point in time,
and there is nothing you can do about it. That's two unsolvable
problems (no client synchronization of snapdir permissions, and no
persistence).

Max
On Tue, Oct 11, 2022 at 11:27 AM Luís Henriques <lhenriques@suse.de> wrote:
> I'm not really sure where (or how) exactly this feature should be
> implemented, but I agree with Xiubo that this approach doesn't look
> correct. This would be yet another hack that can be easily
> circumvented on the client side. If this feature is really required,
> the restriction should be imposed on the MDS side.

I think this is the wrong way to look at this problem. This is like
saying the client can circumvent file permissions by pretending to be
the owning uid.

This isn't about restricting a client from seeing/creating/deleting
snapshots; we need to agree that this is about clients which already
have the permission to do this. This is about restricting which users
(uids) on a client are allowed to do that.

The server cannot possibly implement this restriction; all it can do
is give kind hints to the client, like it currently does with regular
file permissions. But it's up to the client's kernel (the cephfs
driver and the VFS) to enforce those per-uid permissions.

This approach is already implemented in cephfs. All my patch does is
allow overriding the mode mask instead of inheriting the parent's mode
mask. So if you say "this approach doesn't look correct", then you're
actually saying that the cephfs shipped in mainline Linux kernels
isn't correct. That is certainly a reasonable opinion, but it's not
about my patch.

> (However, IMHO the feature looks odd from the beginning: a user that
> owns a directory and has 'rw' access to it but can't run a simple
> 'mkdir' is probably breaking standards compliance even more.)

There is "prior art" for that: read-only mounts and immutable files.
On both, you can have the "w" permission bit but cannot actually
write.

Max
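A concrete illustration of that prior art (paths are hypothetical, and
the second example assumes /mnt/data is an existing mount):

  # Immutable file: the mode bits still show 'w', but writes fail.
  $ touch /tmp/demo && chmod 644 /tmp/demo
  $ sudo chattr +i /tmp/demo
  $ echo data > /tmp/demo        # fails despite the 'w' bit

  # Read-only mount: 'w' permission bits exist, but writes fail.
  $ sudo mount -o remount,ro /mnt/data
  $ touch /mnt/data/newfile      # fails with "Read-only file system"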
On Mon, 2022-10-10 at 10:02 +0800, Xiubo Li wrote:
> On 09/10/2022 18:27, Max Kellermann wrote:
> > On Sun, Oct 9, 2022 at 10:43 AM Xiubo Li <xiubli@redhat.com> wrote:
> > > I mean CEPHFS CLIENT CAPABILITIES [1].
[...]
> > > But your current approach will introduce issues when a UID/GID is
> > > reused after a user/group is deleted?
> > The UID I would specify is one which exists on the client, for a
> > dedicated system user whose purpose is to manage cephfs snapshots
> > of all users. The UID is created when the machine is installed, and
> > is never deleted.
>
> This is an ideal use case IMO.
>
> I googled about UID/GID reuse issues and found that someone has hit a
> similar issue in their use case.
>

This is always a danger and not just with ceph. The solution to that
is good sysadmin practices (i.e. don't reuse uid/gid values without
sanitizing the filesystems first).

[...]
> For the multiple-clients case I think the cephfs capabilities [3]
> could guarantee the consistency of this. As for the single-client
> case: if someone else can change or mess up the ACL in the window
> between creating the directory and the user updating its ACL, don't
> the existing ACLs have the same issue?
>
> [3] https://docs.ceph.com/en/quincy/cephfs/capabilities/
>
> > Neither of those is a problem with my patch.
>
> Jeff,
>
> Any idea?
>

I tend to agree with Max here. The .snap dir is a client-side fiction,
so trying to do something on the MDS to govern its use seems a bit
odd. cephx is really about authenticating clients. I know we do things
like enforce root squashing on the MDS, but this is a little
different.

Now, all of that said, snapshot handling is an area where I'm just not
that knowledgeable. Feel free to ignore my opinion here as uninformed.
On 11/10/2022 18:45, Jeff Layton wrote:
> On Mon, 2022-10-10 at 10:02 +0800, Xiubo Li wrote:
[...]
>> I googled about UID/GID reuse issues and found that someone has hit
>> a similar issue in their use case.
>>
> This is always a danger and not just with ceph. The solution to that
> is good sysadmin practices (i.e. don't reuse uid/gid values without
> sanitizing the filesystems first).

Yeah, this sounds reasonable.

[...]
>> Jeff,
>>
>> Any idea?
>>
> I tend to agree with Max here. The .snap dir is a client-side
> fiction, so trying to do something on the MDS to govern its use seems
> a bit odd. cephx is really about authenticating clients. I know we do
> things like enforce root squashing on the MDS, but this is a little
> different.
>
> Now, all of that said, snapshot handling is an area where I'm just
> not that knowledgeable. Feel free to ignore my opinion here as
> uninformed.

I am thinking the cephfs currently has the same issue we discussed
here, because cephfs saves the UID/GID numbers in the CInode metadata.
When multiple clients share the same cephfs, a user on one client node
could cross-access another user's files from a different node. For
example:

In client nodeA:

    user1's UID is 123, user2's UID is 321.

In client nodeB:

    user1's UID is 321, user2's UID is 123.

And if user1 creates a fileA on client nodeA, then user2 could access
it from client nodeB.

Doesn't this also sound more like a client-side fiction?

- Xiubo
On Thu, 2022-10-20 at 09:29 +0800, Xiubo Li wrote:
> On 11/10/2022 18:45, Jeff Layton wrote:
[...]
> I am thinking the cephfs currently has the same issue we discussed
> here, because cephfs saves the UID/GID numbers in the CInode
> metadata. When multiple clients share the same cephfs, a user on one
> client node could cross-access another user's files from a different
> node. For example:
>
> In client nodeA:
>
>     user1's UID is 123, user2's UID is 321.
>
> In client nodeB:
>
>     user1's UID is 321, user2's UID is 123.
>
> And if user1 creates a fileA on client nodeA, then user2 could access
> it from client nodeB.
>
> Doesn't this also sound more like a client-side fiction?
>

idmapping is a difficult issue and not at all confined to CephFS.
NFSv4 has a whole upcall facility for mapping IDs, for instance. The
MDS has no way to know that 123 and 321 are the same user on different
machines. That sort of mapping must be set up by the administrator.

The real question is: does it make sense for the MDS to use directory
permissions to enforce access on something that isn't really a
directory?

My "gut feeling" here is that the MDS ought to be in charge of
governing which _clients_ are allowed to make snapshots, but it's up
to the client to determine which _users_ are allowed to create them.
With that concept in mind, Max's proposal makes some sense.

Snapshots are not part of POSIX, and the ".snap" directory interface
was copied from Netapp (AFAICT). Maybe CephFS ought to enforce
permissions on snapshots the same way Netapps do? I don't know exactly
how it works there, so some research may be required.

I found this article, but it's behind a paywall:

https://kb.netapp.com/Advice_and_Troubleshooting/Data_Storage_Software/ONTAP_OS_7_Mode/How_to_control_access_to_a_Snapshot_directory
On 20/10/2022 18:13, Jeff Layton wrote:
> On Thu, 2022-10-20 at 09:29 +0800, Xiubo Li wrote:
[...]
> My "gut feeling" here is that the MDS ought to be in charge of
> governing which _clients_ are allowed to make snapshots, but it's up
> to the client to determine which _users_ are allowed to create them.
> With that concept in mind, Max's proposal makes some sense.
[...]

Yeah, these days I have thought about this more. I will review this
patch.

BTW, we discussed in IRC switching this to module parameters instead.
But then what about mounts that are in different namespace groups,
such as containers? That couldn't be controlled as precisely, and
maybe we would hit the same idmapping issue I mentioned above. So
maybe the mount option approach, as Max does it, is the best choice
here?

- Xiubo
On 20/10/2022 18:13, Jeff Layton wrote:
> On Thu, 2022-10-20 at 09:29 +0800, Xiubo Li wrote:
[...]
> My "gut feeling" here is that the MDS ought to be in charge of
> governing which _clients_ are allowed to make snapshots, but it's up
> to the client to determine which _users_ are allowed to create them.
> With that concept in mind, Max's proposal makes some sense.
>
> Snapshots are not part of POSIX, and the ".snap" directory interface
> was copied from Netapp (AFAICT). Maybe CephFS ought to enforce
> permissions on snapshots the same way Netapps do? I don't know
> exactly how it works there, so some research may be required.
>
> I found this article, but it's behind a paywall:
>
> https://kb.netapp.com/Advice_and_Troubleshooting/Data_Storage_Software/ONTAP_OS_7_Mode/How_to_control_access_to_a_Snapshot_directory
>

Patrick provided a better idea for this feature.

The cephx permissions already support the 's' flag, which allows
snapshot create/remove. For a privileged or otherwise specific mount
you can grant the 's' permission; then only that mount can
create/remove snapshots and all the others cannot. Then use a
container or something similar so that only the specific users can
access it.

Thanks!

- Xiubo
On Tue, Oct 25, 2022 at 3:36 AM Xiubo Li <xiubli@redhat.com> wrote:
> The cephx permissions already support the 's' flag, which allows
> snapshot create/remove. For a privileged or otherwise specific mount
> you can grant the 's' permission; then only that mount can
> create/remove snapshots and all the others cannot.

But that's a client permission, not a user permission.

I repeat: the problem is that snapshots should only be
accessible/discoverable/creatable by certain users (UIDs/GIDs) on the
client machine, independent of their permission on the parent
directory. My patch decouples parent directory permissions from
snapdir permissions, and it's a simple and elegant solution to my
problem.

> Then use a container or something similar so that only the specific
> users can access it.

Sorry, I don't get it at all. What is "a container or something", and
how does it enable me to prevent specific users from accessing
snapdirs in their home directories?
On 25/10/2022 15:22, Max Kellermann wrote:
> On Tue, Oct 25, 2022 at 3:36 AM Xiubo Li <xiubli@redhat.com> wrote:
>> The cephx permissions already support the 's' flag, which allows
>> snapshot create/remove. For a privileged or otherwise specific mount
>> you can grant the 's' permission; then only that mount can
>> create/remove snapshots and all the others cannot.
> But that's a client permission, not a user permission.
>
> I repeat: the problem is that snapshots should only be
> accessible/discoverable/creatable by certain users (UIDs/GIDs) on the
> client machine, independent of their permission on the parent
> directory.

Hi Max,

Yeah, the cephx permissions could cover this completely, and there is
no need to worry about the user id mapping issue.

You can allow only mounts with specific client ids, "client.privileged"
for example, to create/remove snapshots:

[client.privileged]
        key = AQA19uZUqIwkHxAAFuUwvq0eJD4S173oFRxe0g==
        caps mds = "allow rws /"
        caps mon = "allow *"
        caps osd = "allow *"

[client.global]
        key = xE21RuZTqIuiHxFFAuEwv4TjJD3R176BFOi4Fj==
        caps mds = "allow rw /"
        caps mon = "allow *"
        caps osd = "allow *"

Then specify the client ids when mounting:

$ sudo ./bin/mount.ceph privileged@.a=/ /mnt/privileged/mountpoint

$ sudo ./bin/mount.ceph global@.a=/ /mnt/global/mountpoint

Just make sure that only the users who should be allowed to
create/remove snapshots can access the "/mnt/privileged/" directory.

I didn't read the openshift code, but when I was debugging bugs I saw
from the logs that it acts similarly to this.

> My patch decouples parent directory permissions from snapdir
> permissions, and it's a simple and elegant solution to my problem.

Yeah, I am aware of the differences between these two approaches. This
should be a common feature, not only in the kernel client; we would
also need to implement it in the cephfs user space client. And if the
above cephx permission approach works well everywhere, I am afraid
this couldn't go into ceph in user space.

>> Then use a container or something similar so that only the specific
>> users can access it.
> Sorry, I don't get it at all. What is "a container or something", and
> how does it enable me to prevent specific users from accessing
> snapdirs in their home directories?

Please see my example above. If that still doesn't work well, please
send a mail to ceph-users to discuss this further; we can probably get
more feedback from there.

Thanks!

- Xiubo
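For completeness, the two client identities in the example above would
be created on the cluster side roughly like this. This is a sketch
only: the filesystem name "cephfs" is an assumption, and the exact CLI
syntax may differ between Ceph releases.

  # Privileged client: 'rws' on the MDS path grants read/write plus
  # the snapshot ('s') capability.
  $ ceph fs authorize cephfs client.privileged / rws

  # Ordinary client: read/write only, no snapshot capability.
  $ ceph fs authorize cephfs client.global / rw

  # Show the generated keys and caps (these end up in the keyrings
  # used by mount.ceph above):
  $ ceph auth get client.privileged
  $ ceph auth get client.global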
On Tue, Oct 25, 2022 at 11:10 AM Xiubo Li <xiubli@redhat.com> wrote:
> $ sudo ./bin/mount.ceph privileged@.a=/ /mnt/privileged/mountpoint
>
> $ sudo ./bin/mount.ceph global@.a=/ /mnt/global/mountpoint

So you have two different mount points where different client
permissions are used. There are various problems with that
architecture:

- it complicates administration, because now every mount has to be
  done twice
- it complicates applications accessing ceph (and their
  configuration), because there are now two mount points
- it increases resource usage, because there are twice as many ceph
  connections
- it interferes with fscache, doubling fscache's local disk usage and
  reducing fscache's efficiency
- ownership of the snapdir is still the same as the parent directory,
  so I can't have non-superuser processes manage snapshots; all
  processes managing snapshots need write permission on the parent
  directory
- it is still all-or-nothing; I can't forbid users to list (+r) or
  access (+x) snapshots

None of those problems exists with my patch.
By default, the ".snap" directory inherits the parent's permissions
and ownership, which allows all users to create new cephfs snapshots
in arbitrary directories they have write access on.

In some environments, giving everybody this capability is not
desirable, but there is currently no way to disallow only some users
to create snapshots. It is only possible to revoke the permission to
the whole client (i.e. all users on the computer which mounts the
cephfs).

This patch allows overriding the permissions and ownership of all
virtual ".snap" directories in a cephfs mount, which allows
restricting (read and write) access to snapshots.

For example, the mount options:

  snapdirmode=0751,snapdiruid=0,snapdirgid=4

... allow only user "root" to create or delete snapshots; group "adm"
(gid=4) is allowed to get a list of snapshots. All others are allowed
to read the contents of existing snapshots (if they know the name).

Signed-off-by: Max Kellermann <max.kellermann@ionos.com>
---
 fs/ceph/inode.c |  7 ++++---
 fs/ceph/super.c | 33 +++++++++++++++++++++++++++++++++
 fs/ceph/super.h |  4 ++++
 3 files changed, 41 insertions(+), 3 deletions(-)

diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 56c53ab3618e..0e9388af2821 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -80,6 +80,7 @@ struct inode *ceph_get_snapdir(struct inode *parent)
 	};
 	struct inode *inode = ceph_get_inode(parent->i_sb, vino);
 	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_mount_options *const fsopt = ceph_inode_to_client(parent)->mount_options;
 
 	if (IS_ERR(inode))
 		return inode;
@@ -96,9 +97,9 @@ struct inode *ceph_get_snapdir(struct inode *parent)
 		goto err;
 	}
 
-	inode->i_mode = parent->i_mode;
-	inode->i_uid = parent->i_uid;
-	inode->i_gid = parent->i_gid;
+	inode->i_mode = fsopt->snapdir_mode == (umode_t)-1 ? parent->i_mode : fsopt->snapdir_mode;
+	inode->i_uid = uid_eq(fsopt->snapdir_uid, INVALID_UID) ? parent->i_uid : fsopt->snapdir_uid;
+	inode->i_gid = gid_eq(fsopt->snapdir_gid, INVALID_GID) ? parent->i_gid : fsopt->snapdir_gid;
 	inode->i_mtime = parent->i_mtime;
 	inode->i_ctime = parent->i_ctime;
 	inode->i_atime = parent->i_atime;
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 40140805bdcf..5e5713946f7b 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -143,6 +143,9 @@ enum {
 	Opt_readdir_max_entries,
 	Opt_readdir_max_bytes,
 	Opt_congestion_kb,
+	Opt_snapdirmode,
+	Opt_snapdiruid,
+	Opt_snapdirgid,
 	/* int args above */
 	Opt_snapdirname,
 	Opt_mds_namespace,
@@ -200,6 +203,9 @@ static const struct fs_parameter_spec ceph_mount_parameters[] = {
 	fsparam_flag_no ("require_active_mds", Opt_require_active_mds),
 	fsparam_u32	("rsize", Opt_rsize),
 	fsparam_string	("snapdirname", Opt_snapdirname),
+	fsparam_u32oct	("snapdirmode", Opt_snapdirmode),
+	fsparam_u32	("snapdiruid", Opt_snapdiruid),
+	fsparam_u32	("snapdirgid", Opt_snapdirgid),
 	fsparam_string	("source", Opt_source),
 	fsparam_string	("mon_addr", Opt_mon_addr),
 	fsparam_u32	("wsize", Opt_wsize),
@@ -414,6 +420,22 @@ static int ceph_parse_mount_param(struct fs_context *fc,
 		fsopt->snapdir_name = param->string;
 		param->string = NULL;
 		break;
+	case Opt_snapdirmode:
+		fsopt->snapdir_mode = result.uint_32;
+		if (fsopt->snapdir_mode & ~0777)
+			return invalfc(fc, "Invalid snapdirmode");
+		fsopt->snapdir_mode |= S_IFDIR;
+		break;
+	case Opt_snapdiruid:
+		fsopt->snapdir_uid = make_kuid(current_user_ns(), result.uint_32);
+		if (!uid_valid(fsopt->snapdir_uid))
+			return invalfc(fc, "Invalid snapdiruid");
+		break;
+	case Opt_snapdirgid:
+		fsopt->snapdir_gid = make_kgid(current_user_ns(), result.uint_32);
+		if (!gid_valid(fsopt->snapdir_gid))
+			return invalfc(fc, "Invalid snapdirgid");
+		break;
 	case Opt_mds_namespace:
 		if (!namespace_equals(fsopt, param->string, strlen(param->string)))
 			return invalfc(fc, "Mismatching mds_namespace");
@@ -734,6 +756,14 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
 		seq_printf(m, ",readdir_max_bytes=%u", fsopt->max_readdir_bytes);
 	if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
 		seq_show_option(m, "snapdirname", fsopt->snapdir_name);
+	if (fsopt->snapdir_mode != (umode_t)-1)
+		seq_printf(m, ",snapdirmode=%o", fsopt->snapdir_mode);
+	if (!uid_eq(fsopt->snapdir_uid, INVALID_UID))
+		seq_printf(m, ",snapdiruid=%o",
+			   from_kuid_munged(&init_user_ns, fsopt->snapdir_uid));
+	if (!gid_eq(fsopt->snapdir_gid, INVALID_GID))
+		seq_printf(m, ",snapdirgid=%o",
+			   from_kgid_munged(&init_user_ns, fsopt->snapdir_gid));
 
 	return 0;
 }
@@ -1335,6 +1365,9 @@ static int ceph_init_fs_context(struct fs_context *fc)
 	fsopt->wsize = CEPH_MAX_WRITE_SIZE;
 	fsopt->rsize = CEPH_MAX_READ_SIZE;
 	fsopt->rasize = CEPH_RASIZE_DEFAULT;
+	fsopt->snapdir_mode = (umode_t)-1;
+	fsopt->snapdir_uid = INVALID_UID;
+	fsopt->snapdir_gid = INVALID_GID;
 	fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
 	if (!fsopt->snapdir_name)
 		goto nomem;
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index d44a366b2f1b..3c930816078d 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -85,6 +85,10 @@ struct ceph_mount_options {
 	unsigned int max_readdir;       /* max readdir result (entries) */
 	unsigned int max_readdir_bytes; /* max readdir result (bytes) */
 
+	umode_t snapdir_mode;
+	kuid_t snapdir_uid;
+	kgid_t snapdir_gid;
+
 	bool new_dev_syntax;
 
 	/*
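For reference, this is roughly how the new options from the commit
message would be used. The device string, mount point and directory
are made up for illustration, and the resulting snapdir permissions
assume the 0751/0/4 values above:

  # Mount with the proposed options (new-style device syntax as used
  # elsewhere in this thread):
  $ sudo mount -t ceph admin@.cephfs=/ /mnt/cephfs \
        -o snapdirmode=0751,snapdiruid=0,snapdirgid=4

  # Every synthesized ".snap" directory would then appear as:
  #   drwxr-x--x  root adm  /mnt/cephfs/some/dir/.snap
  # i.e. only root can mkdir/rmdir snapshots, group adm (gid 4) can
  # list them, and everyone else can only enter a snapshot whose name
  # they already know.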