Message ID | 1685610013-33478-4-git-send-email-akaher@vmware.com |
---|---|
State | New |
Headers | show |
Series | [v3,01/10] tracing: Require all trace events to have a TRACE_SYSTEM | expand |
FYI, all subjects should start with a capital letter: "eventfs: Implement eventfs dir creation functions" On Thu, 1 Jun 2023 14:30:06 +0530 Ajay Kaher <akaher@vmware.com> wrote: > Adding eventfs_file structure which will hold properties of file or dir. > > Adding following functions to add dir in eventfs: > > eventfs_create_events_dir() directly creates events dir with-in "within" is a proper word. > tracing folder. > > eventfs_add_subsystem_dir() adds the information of subsystem_dir to > eventfs and dynamically creates subsystem_dir as and when requires. "as and when requires" does not make sense. > > eventfs_add_dir() adds the information of dir (which is with-in "within" > subsystem_dir) to eventfs and dynamically creates these dir as > and when requires. I'm guessing you want to say: eventfs_add_dir() adds the information of the dir, within a subsystem_dir, to eventfs and dynamically creates these directories when they are accessed. > > Signed-off-by: Ajay Kaher <akaher@vmware.com> > Co-developed-by: Steven Rostedt (VMware) <rostedt@goodmis.org> > Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org> > Tested-by: Ching-lin Yu <chinglinyu@google.com> > Reported-by: kernel test robot <lkp@intel.com> > Link: https://lore.kernel.org/oe-lkp/202305051619.9a469a9a-yujie.liu@intel.com > --- > fs/tracefs/Makefile | 1 + > fs/tracefs/event_inode.c | 272 +++++++++++++++++++++++++++++++++++++++ > include/linux/tracefs.h | 29 +++++ > kernel/trace/trace.h | 1 + > 4 files changed, 303 insertions(+) > create mode 100644 fs/tracefs/event_inode.c > > diff --git a/fs/tracefs/Makefile b/fs/tracefs/Makefile > index 7c35a282b..73c56da8e 100644 > --- a/fs/tracefs/Makefile > +++ b/fs/tracefs/Makefile > @@ -1,5 +1,6 @@ > # SPDX-License-Identifier: GPL-2.0-only > tracefs-objs := inode.o > +tracefs-objs += event_inode.o > > obj-$(CONFIG_TRACING) += tracefs.o > > diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c > new file mode 100644 > index 000000000..a48ce23c0 > --- /dev/null > +++ b/fs/tracefs/event_inode.c > @@ -0,0 +1,272 @@ > +// SPDX-License-Identifier: GPL-2.0-only > +/* > + * event_inode.c - part of tracefs, a pseudo file system for activating tracing > + * > + * Copyright (C) 2020-22 VMware Inc, author: Steven Rostedt (VMware) <rostedt@goodmis.org> > + * Copyright (C) 2020-22 VMware Inc, author: Ajay Kaher <akaher@vmware.com> > + * > + * eventfs is used to show trace events with one set of dentries > + * > + * eventfs stores meta-data of files/dirs and skip to create object of > + * inodes/dentries. As and when requires, eventfs will create the > + * inodes/dentries for only required files/directories. Also eventfs > + * would delete the inodes/dentries once no more requires but preserve > + * the meta data. > + */ > +#include <linux/fsnotify.h> > +#include <linux/fs.h> > +#include <linux/namei.h> > +#include <linux/security.h> > +#include <linux/tracefs.h> > +#include <linux/kref.h> > +#include <linux/delay.h> > +#include "internal.h" > + > +/** > + * eventfs_dentry_to_rwsem - Return corresponding eventfs_rwsem > + * @dentry: a pointer to dentry > + * > + * helper function to return crossponding eventfs_rwsem for given dentry > + */ > +static struct rw_semaphore *eventfs_dentry_to_rwsem(struct dentry *dentry) > +{ > + if (S_ISDIR(dentry->d_inode->i_mode)) > + return (struct rw_semaphore *)dentry->d_inode->i_private; > + else > + return (struct rw_semaphore *)dentry->d_parent->d_inode->i_private; > +} > + > +/** > + * eventfs_down_read - acquire read lock function > + * @eventfs_rwsem: a pointer to rw_semaphore > + * > + * helper function to perform read lock. Nested locking requires because > + * lookup(), release() requires read lock, these could be called directly > + * or from open(), remove() which already hold the read/write lock. > + */ > +static void eventfs_down_read(struct rw_semaphore *eventfs_rwsem) > +{ > + down_read_nested(eventfs_rwsem, SINGLE_DEPTH_NESTING); > +} > + > +/** > + * eventfs_up_read - release read lock function > + * @eventfs_rwsem: a pointer to rw_semaphore > + * > + * helper function to release eventfs_rwsem lock if locked > + */ > +static void eventfs_up_read(struct rw_semaphore *eventfs_rwsem) > +{ > + up_read(eventfs_rwsem); > +} > + > +/** > + * eventfs_down_write - acquire write lock function > + * @eventfs_rwsem: a pointer to rw_semaphore > + * > + * helper function to perform write lock on eventfs_rwsem > + */ > +static void eventfs_down_write(struct rw_semaphore *eventfs_rwsem) > +{ > + while (!down_write_trylock(eventfs_rwsem)) > + msleep(10); What's this loop for? Something like that needs a very good explanation in a comment. Loops like these are usually a sign of a workaround for a bug in the design, or worse, simply hides an existing bug. > +} > + > +/** > + * eventfs_up_write - release write lock function > + * @eventfs_rwsem: a pointer to rw_semaphore > + * > + * helper function to perform write lock on eventfs_rwsem > + */ > +static void eventfs_up_write(struct rw_semaphore *eventfs_rwsem) > +{ > + up_write(eventfs_rwsem); > +} > + > +static const struct file_operations eventfs_file_operations = { > +}; > + > +static const struct inode_operations eventfs_root_dir_inode_operations = { > +}; > + > +/** > + * eventfs_prepare_ef - helper function to prepare eventfs_file > + * @name: a pointer to a string containing the name of the file/directory > + * to create. > + * @mode: the permission that the file should have. > + * @fop: a pointer to a struct file_operations that should be used for > + * this file/directory. > + * @iop: a pointer to a struct inode_operations that should be used for > + * this file/directory. > + * @data: a pointer to something that the caller will want to get to later > + * on. The inode.i_private pointer will point to this value on > + * the open() call. > + * > + * This function allocate the fill eventfs_file structure. "allocates and fills the" ? > + */ > +static struct eventfs_file *eventfs_prepare_ef(const char *name, umode_t mode, > + const struct file_operations *fop, > + const struct inode_operations *iop, > + void *data) > +{ > + struct eventfs_file *ef; > + > + ef = kzalloc(sizeof(*ef), GFP_KERNEL); > + if (!ef) > + return ERR_PTR(-ENOMEM); > + > + ef->name = kstrdup(name, GFP_KERNEL); > + if (!ef->name) { > + kfree(ef); > + return ERR_PTR(-ENOMEM); > + } > + > + if (S_ISDIR(mode)) { > + ef->ei = kzalloc(sizeof(*ef->ei), GFP_KERNEL); > + if (!ef->ei) { > + kfree(ef->name); > + kfree(ef); > + return ERR_PTR(-ENOMEM); > + } > + INIT_LIST_HEAD(&ef->ei->e_top_files); > + } else { > + ef->ei = NULL; > + } > + > + ef->iop = iop; > + ef->fop = fop; > + ef->mode = mode; > + ef->data = data; > + ef->dentry = NULL; > + ef->d_parent = NULL; > + ef->created = false; No need for the initialization to NULL or even the false, as the kzalloc() already did that. > + return ef; > +} > + > +/** > + * eventfs_create_events_dir - create the trace event structure > + * @name: a pointer to a string containing the name of the directory to > + * create. You don't need to add "a pointer" we can see it's a pointer. Just say: * @name: The name of the directory to create Adding more makes it confusing to read. > + * @parent: a pointer to the parent dentry for this file. This should be a > + * directory dentry if set. If this parameter is NULL, then the > + * directory will be created in the root of the tracefs filesystem. > + * @eventfs_rwsem: a pointer to rw_semaphore Same with all the descriptions. > + * > + * This function creates the top of the trace event directory. > + */ > +struct dentry *eventfs_create_events_dir(const char *name, > + struct dentry *parent, > + struct rw_semaphore *eventfs_rwsem) OK, I'm going to have to really look at this. Passing in a lock to the API is just broken. We need to find a way to solve this another way. I'm about to board a plane to JFK shortly, I'm hoping to play with this while flying back. -- Steve > +{ > + struct dentry *dentry = tracefs_start_creating(name, parent); > + struct eventfs_inode *ei; > + struct tracefs_inode *ti; > + struct inode *inode; > + > + if (IS_ERR(dentry)) > + return dentry; > + > + ei = kzalloc(sizeof(*ei), GFP_KERNEL); > + if (!ei) > + return ERR_PTR(-ENOMEM); > + inode = tracefs_get_inode(dentry->d_sb); > + if (unlikely(!inode)) { > + kfree(ei); > + tracefs_failed_creating(dentry); > + return ERR_PTR(-ENOMEM); > + } > + > + init_rwsem(eventfs_rwsem); > + INIT_LIST_HEAD(&ei->e_top_files); > + > + ti = get_tracefs(inode); > + ti->flags |= TRACEFS_EVENT_INODE; > + ti->private = ei; > + > + inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; > + inode->i_op = &eventfs_root_dir_inode_operations; > + inode->i_fop = &eventfs_file_operations; > + inode->i_private = eventfs_rwsem; > + > + /* directory inodes start off with i_nlink == 2 (for "." entry) */ > + inc_nlink(inode); > + d_instantiate(dentry, inode); > + inc_nlink(dentry->d_parent->d_inode); > + fsnotify_mkdir(dentry->d_parent->d_inode, dentry); > + return tracefs_end_creating(dentry); > +}
> On 01-Jul-2023, at 7:24 PM, Steven Rostedt <rostedt@goodmis.org> wrote: > > !! External Email > > FYI, all subjects should start with a capital letter: > > "eventfs: Implement eventfs dir creation functions" > > On Thu, 1 Jun 2023 14:30:06 +0530 > Ajay Kaher <akaher@vmware.com> wrote: > >> Adding eventfs_file structure which will hold properties of file or dir. >> >> Adding following functions to add dir in eventfs: >> >> eventfs_create_events_dir() directly creates events dir with-in > > "within" is a proper word. > >> tracing folder. >> >> eventfs_add_subsystem_dir() adds the information of subsystem_dir to >> eventfs and dynamically creates subsystem_dir as and when requires. > > "as and when requires" does not make sense. > >> >> eventfs_add_dir() adds the information of dir (which is with-in > > "within" > >> subsystem_dir) to eventfs and dynamically creates these dir as >> and when requires. > > I'm guessing you want to say: > > eventfs_add_dir() adds the information of the dir, within a > subsystem_dir, to eventfs and dynamically creates these > directories when they are accessed. > >> >> Signed-off-by: Ajay Kaher <akaher@vmware.com> >> Co-developed-by: Steven Rostedt (VMware) <rostedt@goodmis.org> >> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org> >> Tested-by: Ching-lin Yu <chinglinyu@google.com> >> Reported-by: kernel test robot <lkp@intel.com> >> Link: https://lore.kernel.org/oe-lkp/202305051619.9a469a9a-yujie.liu@intel.com >> --- >> fs/tracefs/Makefile | 1 + >> fs/tracefs/event_inode.c | 272 +++++++++++++++++++++++++++++++++++++++ >> include/linux/tracefs.h | 29 +++++ >> kernel/trace/trace.h | 1 + >> 4 files changed, 303 insertions(+) >> create mode 100644 fs/tracefs/event_inode.c >> >> diff --git a/fs/tracefs/Makefile b/fs/tracefs/Makefile >> index 7c35a282b..73c56da8e 100644 >> --- a/fs/tracefs/Makefile >> +++ b/fs/tracefs/Makefile >> @@ -1,5 +1,6 @@ >> # SPDX-License-Identifier: GPL-2.0-only >> tracefs-objs := inode.o >> +tracefs-objs += event_inode.o >> >> obj-$(CONFIG_TRACING) += tracefs.o >> >> diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c >> new file mode 100644 >> index 000000000..a48ce23c0 >> --- /dev/null >> +++ b/fs/tracefs/event_inode.c >> @@ -0,0 +1,272 @@ >> +// SPDX-License-Identifier: GPL-2.0-only >> +/* >> + * event_inode.c - part of tracefs, a pseudo file system for activating tracing >> + * >> + * Copyright (C) 2020-22 VMware Inc, author: Steven Rostedt (VMware) <rostedt@goodmis.org> >> + * Copyright (C) 2020-22 VMware Inc, author: Ajay Kaher <akaher@vmware.com> >> + * >> + * eventfs is used to show trace events with one set of dentries >> + * >> + * eventfs stores meta-data of files/dirs and skip to create object of >> + * inodes/dentries. As and when requires, eventfs will create the >> + * inodes/dentries for only required files/directories. Also eventfs >> + * would delete the inodes/dentries once no more requires but preserve >> + * the meta data. >> + */ >> +#include <linux/fsnotify.h> >> +#include <linux/fs.h> >> +#include <linux/namei.h> >> +#include <linux/security.h> >> +#include <linux/tracefs.h> >> +#include <linux/kref.h> >> +#include <linux/delay.h> >> +#include "internal.h" >> + >> +/** >> + * eventfs_dentry_to_rwsem - Return corresponding eventfs_rwsem >> + * @dentry: a pointer to dentry >> + * >> + * helper function to return crossponding eventfs_rwsem for given dentry >> + */ >> +static struct rw_semaphore *eventfs_dentry_to_rwsem(struct dentry *dentry) >> +{ >> + if (S_ISDIR(dentry->d_inode->i_mode)) >> + return (struct rw_semaphore *)dentry->d_inode->i_private; >> + else >> + return (struct rw_semaphore *)dentry->d_parent->d_inode->i_private; >> +} >> + >> +/** >> + * eventfs_down_read - acquire read lock function >> + * @eventfs_rwsem: a pointer to rw_semaphore >> + * >> + * helper function to perform read lock. Nested locking requires because >> + * lookup(), release() requires read lock, these could be called directly >> + * or from open(), remove() which already hold the read/write lock. >> + */ >> +static void eventfs_down_read(struct rw_semaphore *eventfs_rwsem) >> +{ >> + down_read_nested(eventfs_rwsem, SINGLE_DEPTH_NESTING); >> +} >> + >> +/** >> + * eventfs_up_read - release read lock function >> + * @eventfs_rwsem: a pointer to rw_semaphore >> + * >> + * helper function to release eventfs_rwsem lock if locked >> + */ >> +static void eventfs_up_read(struct rw_semaphore *eventfs_rwsem) >> +{ >> + up_read(eventfs_rwsem); >> +} >> + >> +/** >> + * eventfs_down_write - acquire write lock function >> + * @eventfs_rwsem: a pointer to rw_semaphore >> + * >> + * helper function to perform write lock on eventfs_rwsem >> + */ >> +static void eventfs_down_write(struct rw_semaphore *eventfs_rwsem) >> +{ >> + while (!down_write_trylock(eventfs_rwsem)) >> + msleep(10); > > What's this loop for? Something like that needs a very good explanation > in a comment. Loops like these are usually a sign of a workaround for a > bug in the design, or worse, simply hides an existing bug. > Yes correct, this logic is to solve deadlock: Thread 1 Thread 2 down_read_nested() - read lock acquired down_write() - waiting for write lock to acquire down_read_nested() - deadlock Deadlock is because rwlock wouldn’t allow read lock to be acquired if write lock is waiting. down_write_trylock() wouldn’t add the write lock in waiting queue, hence helps to prevent deadlock scenario. I was stuck with this Deadlock, tried few methods and finally borrowed from cifs, as it’s upstreamed, tested and working in cifs, please refer: https://elixir.bootlin.com/linux/v6.3.1/source/fs/cifs/file.c#L438 Looking further for your input. I will add explanation in v4. >> +} >> + >> +/** >> + * eventfs_up_write - release write lock function >> + * @eventfs_rwsem: a pointer to rw_semaphore >> + * >> + * helper function to perform write lock on eventfs_rwsem >> + */ >> +static void eventfs_up_write(struct rw_semaphore *eventfs_rwsem) >> +{ >> + up_write(eventfs_rwsem); >> +} >> + >> +static const struct file_operations eventfs_file_operations = { >> +}; >> + >> +static const struct inode_operations eventfs_root_dir_inode_operations = { >> +}; >> + >> +/** >> + * eventfs_prepare_ef - helper function to prepare eventfs_file >> + * @name: a pointer to a string containing the name of the file/directory >> + * to create. >> + * @mode: the permission that the file should have. >> + * @fop: a pointer to a struct file_operations that should be used for >> + * this file/directory. >> + * @iop: a pointer to a struct inode_operations that should be used for >> + * this file/directory. >> + * @data: a pointer to something that the caller will want to get to later >> + * on. The inode.i_private pointer will point to this value on >> + * the open() call. >> + * >> + * This function allocate the fill eventfs_file structure. > > "allocates and fills the" ? > >> + */ >> +static struct eventfs_file *eventfs_prepare_ef(const char *name, umode_t mode, >> + const struct file_operations *fop, >> + const struct inode_operations *iop, >> + void *data) >> +{ >> + struct eventfs_file *ef; >> + >> + ef = kzalloc(sizeof(*ef), GFP_KERNEL); >> + if (!ef) >> + return ERR_PTR(-ENOMEM); >> + >> + ef->name = kstrdup(name, GFP_KERNEL); >> + if (!ef->name) { >> + kfree(ef); >> + return ERR_PTR(-ENOMEM); >> + } >> + >> + if (S_ISDIR(mode)) { >> + ef->ei = kzalloc(sizeof(*ef->ei), GFP_KERNEL); >> + if (!ef->ei) { >> + kfree(ef->name); >> + kfree(ef); >> + return ERR_PTR(-ENOMEM); >> + } >> + INIT_LIST_HEAD(&ef->ei->e_top_files); >> + } else { >> + ef->ei = NULL; >> + } >> + >> + ef->iop = iop; >> + ef->fop = fop; >> + ef->mode = mode; >> + ef->data = data; >> + ef->dentry = NULL; >> + ef->d_parent = NULL; >> + ef->created = false; > > No need for the initialization to NULL or even the false, as the > kzalloc() already did that. > >> + return ef; >> +} >> + >> +/** >> + * eventfs_create_events_dir - create the trace event structure >> + * @name: a pointer to a string containing the name of the directory to >> + * create. > > You don't need to add "a pointer" we can see it's a pointer. Just say: > > * @name: The name of the directory to create > > Adding more makes it confusing to read. > >> + * @parent: a pointer to the parent dentry for this file. This should be a >> + * directory dentry if set. If this parameter is NULL, then the >> + * directory will be created in the root of the tracefs filesystem. >> + * @eventfs_rwsem: a pointer to rw_semaphore > > Same with all the descriptions. > > >> + * >> + * This function creates the top of the trace event directory. >> + */ >> +struct dentry *eventfs_create_events_dir(const char *name, >> + struct dentry *parent, >> + struct rw_semaphore *eventfs_rwsem) > > OK, I'm going to have to really look at this. Passing in a lock to the > API is just broken. We need to find a way to solve this another way. eventfs_rwsem is a member of struct trace_array, I guess we should pass pointer to trace_array. > I'm about to board a plane to JFK shortly, I'm hoping to play with this > while flying back. > I have replied for major concerns. All other minor I will take care in v4. Thanks a lot for giving time to eventfs patches. - Ajay > -- Steve > > >> +{ >> + struct dentry *dentry = tracefs_start_creating(name, parent); >> + struct eventfs_inode *ei; >> + struct tracefs_inode *ti; >> + struct inode *inode; >> + >> + if (IS_ERR(dentry)) >> + return dentry; >> + >> + ei = kzalloc(sizeof(*ei), GFP_KERNEL); >> + if (!ei) >> + return ERR_PTR(-ENOMEM); >> + inode = tracefs_get_inode(dentry->d_sb); >> + if (unlikely(!inode)) { >> + kfree(ei); >> + tracefs_failed_creating(dentry); >> + return ERR_PTR(-ENOMEM); >> + } >> + >> + init_rwsem(eventfs_rwsem); >> + INIT_LIST_HEAD(&ei->e_top_files); >> + >> + ti = get_tracefs(inode); >> + ti->flags |= TRACEFS_EVENT_INODE; >> + ti->private = ei; >> + >> + inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; >> + inode->i_op = &eventfs_root_dir_inode_operations; >> + inode->i_fop = &eventfs_file_operations; >> + inode->i_private = eventfs_rwsem; >> + >> + /* directory inodes start off with i_nlink == 2 (for "." entry) */ >> + inc_nlink(inode); >> + d_instantiate(dentry, inode); >> + inc_nlink(dentry->d_parent->d_inode); >> + fsnotify_mkdir(dentry->d_parent->d_inode, dentry); >> + return tracefs_end_creating(dentry); >> +} > > !! External Email: This email originated from outside of the organization. Do not click links or open attachments unless you recognize the sender.
On Mon, 3 Jul 2023 10:13:22 +0000 Ajay Kaher <akaher@vmware.com> wrote: > >> +/** > >> + * eventfs_down_write - acquire write lock function > >> + * @eventfs_rwsem: a pointer to rw_semaphore > >> + * > >> + * helper function to perform write lock on eventfs_rwsem > >> + */ > >> +static void eventfs_down_write(struct rw_semaphore *eventfs_rwsem) > >> +{ > >> + while (!down_write_trylock(eventfs_rwsem)) > >> + msleep(10); > > > > What's this loop for? Something like that needs a very good explanation > > in a comment. Loops like these are usually a sign of a workaround for a > > bug in the design, or worse, simply hides an existing bug. > > > > Yes correct, this logic is to solve deadlock: > > Thread 1 Thread 2 > down_read_nested() - read lock acquired > down_write() - waiting for write lock to acquire > down_read_nested() - deadlock > > Deadlock is because rwlock wouldn’t allow read lock to be acquired if write lock is waiting. > down_write_trylock() wouldn’t add the write lock in waiting queue, hence helps to prevent > deadlock scenario. > > I was stuck with this Deadlock, tried few methods and finally borrowed from cifs, as it’s > upstreamed, tested and working in cifs, please refer: > https://elixir.bootlin.com/linux/v6.3.1/source/fs/cifs/file.c#L438 I just looked at that code and the commit, and I honestly believe that is a horrible hack, and very fragile. It's in the smb code, so it was unlikely reviewed by anyone outside that subsystem. I really do not want to prolificate that solution around the kernel. We need to come up with something else. I also think it's buggy (yes the cifs code is buggy!) because in the comment above the down_read_nested() it says: /* * nested locking. NOTE: rwsems are not allowed to recurse * (which occurs if the same task tries to acquire the same * lock instance multiple times), but multiple locks of the * same lock class might be taken, if the order of the locks * is always the same. This ordering rule can be expressed * to lockdep via the _nested() APIs, but enumerating the * subclasses that are used. (If the nesting relationship is * static then another method for expressing nested locking is * the explicit definition of lock class keys and the use of * lockdep_set_class() at lock initialization time. * See Documentation/locking/lockdep-design.rst for more details.) */ So this is NOT a solution (and the cifs code should be fixed too!) Can you show me the exact backtrace where the reader lock gets taken again? We will have to come up with a way to not take the same lock twice. We can also look to see if we can implement this with RCU. What exactly is this rwsem protecting? > > Looking further for your input. I will add explanation in v4. > > > >> +} > >> + [..] > >> + * > >> + * This function creates the top of the trace event directory. > >> + */ > >> +struct dentry *eventfs_create_events_dir(const char *name, > >> + struct dentry *parent, > >> + struct rw_semaphore *eventfs_rwsem) > > > > OK, I'm going to have to really look at this. Passing in a lock to the > > API is just broken. We need to find a way to solve this another way. > > eventfs_rwsem is a member of struct trace_array, I guess we should > pass pointer to trace_array. No, it should not be part of the trace_array. If we can't do this with RCU, then we need to add a descriptor that contains the dentry that is returned above, and have the lock held there. The caller of the eventfs_create_events_dir() should not care about locking. That's an implementation detail that should *not* be part of the API. That is, if you need a lock: struct eventfs_dentry { struct dentry *dentry; struct rwsem *rwsem; }; And then get to that lock by using the container_of() macro. All created eventfs dentry's could have this structure, where the rwsem points to the top one. Again, that's only if we can't do this with RCU. -- Steve > > > > I'm about to board a plane to JFK shortly, I'm hoping to play with this > > while flying back. > > > > I have replied for major concerns. All other minor I will take care in v4. > > Thanks a lot for giving time to eventfs patches. > > - Ajay >
> On 03-Jul-2023, at 8:38 PM, Steven Rostedt <rostedt@goodmis.org> wrote: > > I just looked at that code and the commit, and I honestly believe that > is a horrible hack, and very fragile. It's in the smb code, so it was > unlikely reviewed by anyone outside that subsystem. I really do not > want to prolificate that solution around the kernel. We need to come up > with something else. > > I also think it's buggy (yes the cifs code is buggy!) because in the > comment above the down_read_nested() it says: > > /* > * nested locking. NOTE: rwsems are not allowed to recurse > * (which occurs if the same task tries to acquire the same > * lock instance multiple times), but multiple locks of the > * same lock class might be taken, if the order of the locks > * is always the same. This ordering rule can be expressed > * to lockdep via the _nested() APIs, but enumerating the > * subclasses that are used. (If the nesting relationship is > * static then another method for expressing nested locking is > * the explicit definition of lock class keys and the use of > * lockdep_set_class() at lock initialization time. > * See Documentation/locking/lockdep-design.rst for more details.) > */ > > So this is NOT a solution (and the cifs code should be fixed too!) > > Can you show me the exact backtrace where the reader lock gets taken > again? We will have to come up with a way to not take the same lock > twice. [ 244.185505] eventfs_root_lookup+0x37/0x1f0 <--- require read lock [ 244.185509] __lookup_slow+0x72/0x100 [ 244.185511] lookup_one_len+0x6a/0x70 [ 244.185513] eventfs_start_creating+0x58/0xd0 [ 244.185515] ? security_locked_down+0x2e/0x50 [ 244.185518] eventfs_create_file+0x57/0x150 [ 244.185521] dcache_dir_open_wrapper+0x1c6/0x260 <--- require read lock [ 244.185524] ? __pfx_dcache_dir_open_wrapper+0x10/0x10 [ 244.185526] do_dentry_open+0x1ed/0x420 [ 244.185529] vfs_open+0x2d/0x40 > > We can also look to see if we can implement this with RCU. What exactly > is this rwsem protecting? > - struct eventfs_file holds the meta-data for file or dir. https://github.com/intel-lab-lkp/linux/blob/dfe0dc15a73261ed83cdc728e43f4b3d4e315aae/include/linux/tracefs.h#L28 - eventfs_rwsem is supposed to protect the 'link-list which is made of struct eventfs_file ' and elements of struct eventfs_file. I tried one more solution i.e by checking owner of lock: static inline struct task_struct *rwsem_owner(struct rw_semaphore *sem) { return (struct task_struct *) (atomic_long_read(&sem->owner) & ~RWSEM_OWNER_FLAGS_MASK); } But rwsem_owner() is static. > >> >> Looking further for your input. I will add explanation in v4. >> >> >>>> +} >>>> + > > [..] > >>>> + * >>>> + * This function creates the top of the trace event directory. >>>> + */ >>>> +struct dentry *eventfs_create_events_dir(const char *name, >>>> + struct dentry *parent, >>>> + struct rw_semaphore *eventfs_rwsem) >>> >>> OK, I'm going to have to really look at this. Passing in a lock to the >>> API is just broken. We need to find a way to solve this another way. >> >> eventfs_rwsem is a member of struct trace_array, I guess we should >> pass pointer to trace_array. > > No, it should not be part of the trace_array. If we can't do this with > RCU, then we need to add a descriptor that contains the dentry that is > returned above, and have the lock held there. The caller of the > eventfs_create_events_dir() should not care about locking. That's an > implementation detail that should *not* be part of the API. > > That is, if you need a lock: > > struct eventfs_dentry { > struct dentry *dentry; > struct rwsem *rwsem; > }; > > And then get to that lock by using the container_of() macro. All > created eventfs dentry's could have this structure, where the rwsem > points to the top one. Again, that's only if we can't do this with RCU. Ok. Let’s first fix locking issue. -Ajay
On Mon, 3 Jul 2023 18:51:22 +0000 Ajay Kaher <akaher@vmware.com> wrote: > > > > We can also look to see if we can implement this with RCU. What exactly > > is this rwsem protecting? > > > > - struct eventfs_file holds the meta-data for file or dir. > https://github.com/intel-lab-lkp/linux/blob/dfe0dc15a73261ed83cdc728e43f4b3d4e315aae/include/linux/tracefs.h#L28 > - eventfs_rwsem is supposed to protect the 'link-list which is made of struct eventfs_file > ' and elements of struct eventfs_file. RCU is usually the perfect solution for protecting link lists though. I'll take a look at this when I get back to work. -- Steve
On Mon, 10 Jul 2023 18:53:53 +0000 Ajay Kaher <akaher@vmware.com> wrote: > > On 10-Jul-2023, at 7:24 AM, Steven Rostedt <rostedt@goodmis.org> wrote: > > > > !! External Email > > > > On Mon, 3 Jul 2023 15:52:26 -0400 > > Steven Rostedt <rostedt@goodmis.org> wrote: > > > >> On Mon, 3 Jul 2023 18:51:22 +0000 > >> Ajay Kaher <akaher@vmware.com> wrote: > >> > >>>> > >>>> We can also look to see if we can implement this with RCU. What exactly > >>>> is this rwsem protecting? > >>>> > >>> > >>> - struct eventfs_file holds the meta-data for file or dir. > >>> https://github.com/intel-lab-lkp/linux/blob/dfe0dc15a73261ed83cdc728e43f4b3d4e315aae/include/linux/tracefs.h#L28 > >>> - eventfs_rwsem is supposed to protect the 'link-list which is made of struct eventfs_file > >>> ' and elements of struct eventfs_file. > >> > >> RCU is usually the perfect solution for protecting link lists though. I'll > >> take a look at this when I get back to work. > >> > > > > So I did the below patch on top of this series. If you could fold this > > into the appropriate patches, it should get us closer to an acceptable > > solution. > > > > What I did was: > > > > 1. Moved the struct eventfs_file and eventfs_inode into event_inode.c as it > > really should not be exposed to all users. > > > > 2. Added a recursion check to eventfs_remove_rec() as it is really > > dangerous to have unchecked recursion in the kernel (we do have a fixed > > size stack). > > > > 3. Removed all the eventfs_rwsem code and replaced it with an srcu lock for > > the readers, and a mutex to synchronize the writers of the list. > > > > 4. Added a eventfs_mutex that is used for the modifications of the > > dentry itself (as well as modifying the list from 3 above). > > > > 5. Have the free use srcu callbacks. After the srcu grace periods are done, > > it adds the eventfs_file onto a llist (lockless link list) and wakes up a > > work queue. Then the work queue does the freeing (this needs to be done in > > task/workqueue context, as srcu callbacks are done in softirq context). > > > > This appears to pass through some of my instance stress tests as well as > > the in tree ftrace selftests. > > > > Awesome :) > > I have manually applied the patches and ftracetest results are same as v3. > No more complains from lockdep. > > I will merge this into appropriate patches of v3 and soon send v4. > > You have renamed eventfs_create_dir() to create_dir(), and kept eventfs_create_dir() > just a wrapper with lock, same for eventfs_create_file(). However these wrapper no where > used, I will drop these wrappers. Ah, I thought that because they started with "eventfs_" that they were used for some fops pointer. Note, I try to avoid using the "eventfs_" naming for static functions that are not exported elsewhere. > > I was trying to have independent lock for each instance of events. As common lock > for every instance of events is not must. We can find a way to make the lock for the root later. Let's get it working first before we optimize it. I do not want to expose any locking to the users of this interface. > > Something was broken in your mail (I guess cc list) and couldn’t reach to lkml or > ignored by lkml. I just wanted to track the auto test results from linux-kselftest. Yeah, claws-mail has an issue with some emails with quotes in it (sometimes drops the second quote). Sad part is, it happens after I hit send, and it is not part of the email. I'll send this reply now, but I bet it's going to happen again. Let's see :-/ I checked the To and Cc's and they all have the proper quotes. Let's see what ends up in my "Sent" folder. -- Steve
On Mon, 10 Jul 2023 15:06:06 -0400 Steven Rostedt <rostedt@goodmis.org> wrote: > > > > Something was broken in your mail (I guess cc list) and couldn’t reach to lkml or > > ignored by lkml. I just wanted to track the auto test results from linux-kselftest. > > Yeah, claws-mail has an issue with some emails with quotes in it (sometimes > drops the second quote). Sad part is, it happens after I hit send, and it > is not part of the email. I'll send this reply now, but I bet it's going to happen again. > > Let's see :-/ I checked the To and Cc's and they all have the proper > quotes. Let's see what ends up in my "Sent" folder. Sorry for the spam, but I just upgraded my claws-mail from 3.19.0 to 3.19.1 and I just want to see if it fails again. -- Steve
diff --git a/fs/tracefs/Makefile b/fs/tracefs/Makefile index 7c35a282b..73c56da8e 100644 --- a/fs/tracefs/Makefile +++ b/fs/tracefs/Makefile @@ -1,5 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only tracefs-objs := inode.o +tracefs-objs += event_inode.o obj-$(CONFIG_TRACING) += tracefs.o diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c new file mode 100644 index 000000000..a48ce23c0 --- /dev/null +++ b/fs/tracefs/event_inode.c @@ -0,0 +1,272 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * event_inode.c - part of tracefs, a pseudo file system for activating tracing + * + * Copyright (C) 2020-22 VMware Inc, author: Steven Rostedt (VMware) <rostedt@goodmis.org> + * Copyright (C) 2020-22 VMware Inc, author: Ajay Kaher <akaher@vmware.com> + * + * eventfs is used to show trace events with one set of dentries + * + * eventfs stores meta-data of files/dirs and skip to create object of + * inodes/dentries. As and when requires, eventfs will create the + * inodes/dentries for only required files/directories. Also eventfs + * would delete the inodes/dentries once no more requires but preserve + * the meta data. + */ +#include <linux/fsnotify.h> +#include <linux/fs.h> +#include <linux/namei.h> +#include <linux/security.h> +#include <linux/tracefs.h> +#include <linux/kref.h> +#include <linux/delay.h> +#include "internal.h" + +/** + * eventfs_dentry_to_rwsem - Return corresponding eventfs_rwsem + * @dentry: a pointer to dentry + * + * helper function to return crossponding eventfs_rwsem for given dentry + */ +static struct rw_semaphore *eventfs_dentry_to_rwsem(struct dentry *dentry) +{ + if (S_ISDIR(dentry->d_inode->i_mode)) + return (struct rw_semaphore *)dentry->d_inode->i_private; + else + return (struct rw_semaphore *)dentry->d_parent->d_inode->i_private; +} + +/** + * eventfs_down_read - acquire read lock function + * @eventfs_rwsem: a pointer to rw_semaphore + * + * helper function to perform read lock. Nested locking requires because + * lookup(), release() requires read lock, these could be called directly + * or from open(), remove() which already hold the read/write lock. + */ +static void eventfs_down_read(struct rw_semaphore *eventfs_rwsem) +{ + down_read_nested(eventfs_rwsem, SINGLE_DEPTH_NESTING); +} + +/** + * eventfs_up_read - release read lock function + * @eventfs_rwsem: a pointer to rw_semaphore + * + * helper function to release eventfs_rwsem lock if locked + */ +static void eventfs_up_read(struct rw_semaphore *eventfs_rwsem) +{ + up_read(eventfs_rwsem); +} + +/** + * eventfs_down_write - acquire write lock function + * @eventfs_rwsem: a pointer to rw_semaphore + * + * helper function to perform write lock on eventfs_rwsem + */ +static void eventfs_down_write(struct rw_semaphore *eventfs_rwsem) +{ + while (!down_write_trylock(eventfs_rwsem)) + msleep(10); +} + +/** + * eventfs_up_write - release write lock function + * @eventfs_rwsem: a pointer to rw_semaphore + * + * helper function to perform write lock on eventfs_rwsem + */ +static void eventfs_up_write(struct rw_semaphore *eventfs_rwsem) +{ + up_write(eventfs_rwsem); +} + +static const struct file_operations eventfs_file_operations = { +}; + +static const struct inode_operations eventfs_root_dir_inode_operations = { +}; + +/** + * eventfs_prepare_ef - helper function to prepare eventfs_file + * @name: a pointer to a string containing the name of the file/directory + * to create. + * @mode: the permission that the file should have. + * @fop: a pointer to a struct file_operations that should be used for + * this file/directory. + * @iop: a pointer to a struct inode_operations that should be used for + * this file/directory. + * @data: a pointer to something that the caller will want to get to later + * on. The inode.i_private pointer will point to this value on + * the open() call. + * + * This function allocate the fill eventfs_file structure. + */ +static struct eventfs_file *eventfs_prepare_ef(const char *name, umode_t mode, + const struct file_operations *fop, + const struct inode_operations *iop, + void *data) +{ + struct eventfs_file *ef; + + ef = kzalloc(sizeof(*ef), GFP_KERNEL); + if (!ef) + return ERR_PTR(-ENOMEM); + + ef->name = kstrdup(name, GFP_KERNEL); + if (!ef->name) { + kfree(ef); + return ERR_PTR(-ENOMEM); + } + + if (S_ISDIR(mode)) { + ef->ei = kzalloc(sizeof(*ef->ei), GFP_KERNEL); + if (!ef->ei) { + kfree(ef->name); + kfree(ef); + return ERR_PTR(-ENOMEM); + } + INIT_LIST_HEAD(&ef->ei->e_top_files); + } else { + ef->ei = NULL; + } + + ef->iop = iop; + ef->fop = fop; + ef->mode = mode; + ef->data = data; + ef->dentry = NULL; + ef->d_parent = NULL; + ef->created = false; + return ef; +} + +/** + * eventfs_create_events_dir - create the trace event structure + * @name: a pointer to a string containing the name of the directory to + * create. + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is NULL, then the + * directory will be created in the root of the tracefs filesystem. + * @eventfs_rwsem: a pointer to rw_semaphore + * + * This function creates the top of the trace event directory. + */ +struct dentry *eventfs_create_events_dir(const char *name, + struct dentry *parent, + struct rw_semaphore *eventfs_rwsem) +{ + struct dentry *dentry = tracefs_start_creating(name, parent); + struct eventfs_inode *ei; + struct tracefs_inode *ti; + struct inode *inode; + + if (IS_ERR(dentry)) + return dentry; + + ei = kzalloc(sizeof(*ei), GFP_KERNEL); + if (!ei) + return ERR_PTR(-ENOMEM); + inode = tracefs_get_inode(dentry->d_sb); + if (unlikely(!inode)) { + kfree(ei); + tracefs_failed_creating(dentry); + return ERR_PTR(-ENOMEM); + } + + init_rwsem(eventfs_rwsem); + INIT_LIST_HEAD(&ei->e_top_files); + + ti = get_tracefs(inode); + ti->flags |= TRACEFS_EVENT_INODE; + ti->private = ei; + + inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; + inode->i_op = &eventfs_root_dir_inode_operations; + inode->i_fop = &eventfs_file_operations; + inode->i_private = eventfs_rwsem; + + /* directory inodes start off with i_nlink == 2 (for "." entry) */ + inc_nlink(inode); + d_instantiate(dentry, inode); + inc_nlink(dentry->d_parent->d_inode); + fsnotify_mkdir(dentry->d_parent->d_inode, dentry); + return tracefs_end_creating(dentry); +} + +/** + * eventfs_add_subsystem_dir - add eventfs subsystem_dir to list to create later + * @name: a pointer to a string containing the name of the file to create. + * @parent: a pointer to the parent dentry for this dir. + * @eventfs_rwsem: a pointer to rw_semaphore + * + * This function adds eventfs subsystem dir to list. + * And all these dirs are created on the fly when they are looked up, + * and the dentry and inodes will be removed when they are done. + */ +struct eventfs_file *eventfs_add_subsystem_dir(const char *name, + struct dentry *parent, + struct rw_semaphore *eventfs_rwsem) +{ + struct tracefs_inode *ti_parent; + struct eventfs_inode *ei_parent; + struct eventfs_file *ef; + + if (!parent) + return ERR_PTR(-EINVAL); + + ti_parent = get_tracefs(parent->d_inode); + ei_parent = ti_parent->private; + + ef = eventfs_prepare_ef(name, + S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO, + &eventfs_file_operations, + &eventfs_root_dir_inode_operations, + (void *) eventfs_rwsem); + + if (IS_ERR(ef)) + return ef; + + eventfs_down_write(eventfs_rwsem); + list_add_tail(&ef->list, &ei_parent->e_top_files); + ef->d_parent = parent; + eventfs_up_write(eventfs_rwsem); + return ef; +} + +/** + * eventfs_add_dir - add eventfs dir to list to create later + * @name: a pointer to a string containing the name of the file to create. + * @ef_parent: a pointer to the parent eventfs_file for this dir. + * @eventfs_rwsem: a pointer to rw_semaphore + * + * This function adds eventfs dir to list. + * And all these dirs are created on the fly when they are looked up, + * and the dentry and inodes will be removed when they are done. + */ +struct eventfs_file *eventfs_add_dir(const char *name, + struct eventfs_file *ef_parent, + struct rw_semaphore *eventfs_rwsem) +{ + struct eventfs_file *ef; + + if (!ef_parent) + return ERR_PTR(-EINVAL); + + ef = eventfs_prepare_ef(name, + S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO, + &eventfs_file_operations, + &eventfs_root_dir_inode_operations, + (void *) eventfs_rwsem); + + if (IS_ERR(ef)) + return ef; + + eventfs_down_write(eventfs_rwsem); + list_add_tail(&ef->list, &ef_parent->ei->e_top_files); + ef->d_parent = ef_parent->dentry; + eventfs_up_write(eventfs_rwsem); + return ef; +} diff --git a/include/linux/tracefs.h b/include/linux/tracefs.h index 999124459..aeca6761f 100644 --- a/include/linux/tracefs.h +++ b/include/linux/tracefs.h @@ -21,6 +21,35 @@ struct file_operations; #ifdef CONFIG_TRACING +struct eventfs_inode { + struct list_head e_top_files; +}; + +struct eventfs_file { + const char *name; + struct dentry *d_parent; + struct dentry *dentry; + struct list_head list; + struct eventfs_inode *ei; + const struct file_operations *fop; + const struct inode_operations *iop; + void *data; + umode_t mode; + bool created; +}; + +struct dentry *eventfs_create_events_dir(const char *name, + struct dentry *parent, + struct rw_semaphore *eventfs_rwsem); + +struct eventfs_file *eventfs_add_subsystem_dir(const char *name, + struct dentry *parent, + struct rw_semaphore *eventfs_rwsem); + +struct eventfs_file *eventfs_add_dir(const char *name, + struct eventfs_file *ef_parent, + struct rw_semaphore *eventfs_rwsem); + struct dentry *tracefs_create_file(const char *name, umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 79bdefe92..b895c3346 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -359,6 +359,7 @@ struct trace_array { struct dentry *options; struct dentry *percpu_dir; struct dentry *event_dir; + struct rw_semaphore eventfs_rwsem; struct trace_options *topts; struct list_head systems; struct list_head events;