diff mbox series

gpiolib: cdev: fix NULL-pointer dereferences

Message ID 20221125153257.528826-1-brgl@bgdev.pl
State New
Headers show
Series gpiolib: cdev: fix NULL-pointer dereferences | expand

Commit Message

Bartosz Golaszewski Nov. 25, 2022, 3:32 p.m. UTC
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>

There are several places where we can crash the kernel by requesting
lines, unbinding the GPIO device, then calling any of the system calls
relevant to the GPIO character device's anonymous file descriptors:
ioctl(), read(), poll().

While I observed it with the GPIO simulator, it will also happen for any
of the GPIO devices that can be hot-unplugged - for instance any HID GPIO
expander (e.g. CP2112).

This affects both v1 and v2 uAPI.

Fix this by simply checking if the GPIO chip pointer is not NULL.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
---
 drivers/gpio/gpiolib-cdev.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

Comments

Kent Gibson Nov. 25, 2022, 4:24 p.m. UTC | #1
On Fri, Nov 25, 2022 at 04:32:57PM +0100, Bartosz Golaszewski wrote:
> From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
> 
> There are several places where we can crash the kernel by requesting
> lines, unbinding the GPIO device, then calling any of the system calls
> relevant to the GPIO character device's annonymous file descriptors:
> ioctl(), read(), poll().
> 
> While I observed it with the GPIO simulator, it will also happen for any
> of the GPIO devices that can be hot-unplugged - for instance any HID GPIO
> expander (e.g. CP2112).
> 
> This affects both v1 and v2 uAPI.
> 
> Fix this by simply checking if the GPIO chip pointer is not NULL.
> 

Fixes: ??

And split for v1 and v2 as the Fixes for those will differ?

> Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
> ---
>  drivers/gpio/gpiolib-cdev.c | 15 +++++++++++++++
>  1 file changed, 15 insertions(+)
> 
> diff --git a/drivers/gpio/gpiolib-cdev.c b/drivers/gpio/gpiolib-cdev.c
> index 0cb6b468f364..d5632742942a 100644
> --- a/drivers/gpio/gpiolib-cdev.c
> +++ b/drivers/gpio/gpiolib-cdev.c
> @@ -201,6 +201,9 @@ static long linehandle_ioctl(struct file *file, unsigned int cmd,
>  	unsigned int i;
>  	int ret;
>  
> +	if (!lh->gdev->chip)
> +		return -ENODEV;
> +

Is there anything to prevent the chip being removed by another thread
between this check and subsequent usage?

Cheers,
Kent.

>  	switch (cmd) {
>  	case GPIOHANDLE_GET_LINE_VALUES_IOCTL:
>  		/* NOTE: It's okay to read values of output lines */
> @@ -1384,6 +1387,9 @@ static long linereq_ioctl(struct file *file, unsigned int cmd,
>  	struct linereq *lr = file->private_data;
>  	void __user *ip = (void __user *)arg;
>  
> +	if (!lr->gdev->chip)
> +		return -ENODEV;
> +
>  	switch (cmd) {
>  	case GPIO_V2_LINE_GET_VALUES_IOCTL:
>  		return linereq_get_values(lr, ip);
> @@ -1716,6 +1722,9 @@ static __poll_t lineevent_poll(struct file *file,
>  	struct lineevent_state *le = file->private_data;
>  	__poll_t events = 0;
>  
> +	if (!le->gdev->chip)
> +		return -ENODEV;
> +
>  	poll_wait(file, &le->wait, wait);
>  
>  	if (!kfifo_is_empty_spinlocked_noirqsave(&le->events, &le->wait.lock))
> @@ -1740,6 +1749,9 @@ static ssize_t lineevent_read(struct file *file,
>  	ssize_t ge_size;
>  	int ret;
>  
> +	if (!le->gdev->chip)
> +		return -ENODEV;
> +
>  	/*
>  	 * When compatible system call is being used the struct gpioevent_data,
>  	 * in case of at least ia32, has different size due to the alignment
> @@ -1821,6 +1833,9 @@ static long lineevent_ioctl(struct file *file, unsigned int cmd,
>  	void __user *ip = (void __user *)arg;
>  	struct gpiohandle_data ghd;
>  
> +	if (!le->gdev->chip)
> +		return -ENODEV;
> +
>  	/*
>  	 * We can get the value for an event line but not set it,
>  	 * because it is input by definition.
> -- 
> 2.37.2
>
Bartosz Golaszewski Nov. 25, 2022, 4:48 p.m. UTC | #2
On Fri, Nov 25, 2022 at 5:24 PM Kent Gibson <warthog618@gmail.com> wrote:
>
> On Fri, Nov 25, 2022 at 04:32:57PM +0100, Bartosz Golaszewski wrote:
> > From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
> >
> > There are several places where we can crash the kernel by requesting
> > lines, unbinding the GPIO device, then calling any of the system calls
> > relevant to the GPIO character device's annonymous file descriptors:
> > ioctl(), read(), poll().
> >
> > While I observed it with the GPIO simulator, it will also happen for any
> > of the GPIO devices that can be hot-unplugged - for instance any HID GPIO
> > expander (e.g. CP2112).
> >
> > This affects both v1 and v2 uAPI.
> >
> > Fix this by simply checking if the GPIO chip pointer is not NULL.
> >
>
> Fixes: ??
>
> And split for v1 and v2 as the Fixes for those will differ?
>
> > Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
> > ---
> >  drivers/gpio/gpiolib-cdev.c | 15 +++++++++++++++
> >  1 file changed, 15 insertions(+)
> >
> > diff --git a/drivers/gpio/gpiolib-cdev.c b/drivers/gpio/gpiolib-cdev.c
> > index 0cb6b468f364..d5632742942a 100644
> > --- a/drivers/gpio/gpiolib-cdev.c
> > +++ b/drivers/gpio/gpiolib-cdev.c
> > @@ -201,6 +201,9 @@ static long linehandle_ioctl(struct file *file, unsigned int cmd,
> >       unsigned int i;
> >       int ret;
> >
> > +     if (!lh->gdev->chip)
> > +             return -ENODEV;
> > +
>
> Is there anything to prevent the chip being removed by another thread
> between this check and subsequent usage?
>

Eh... not really, no. The issue we have here seems to be the same as
the one Laurent Pinchart described back in 2015[1] and revisited
during his 2022 linux plumbers presentation[2], except he blamed it on
devres, whereas I think the problem is much deeper and devres has
nothing to do with it.

Ideally, we'd need to harden the driver model globally against device
unbinding caused by hot-unplug.

After a quick glance at the relevant code, I think we'd need to add
some flag to struct file for the vfs to check on any fs operation and
return an error early if user-space tries to operate on an fd
associated with a removed device. I'm not sure yet if that's feasible
globally or even the right solution at all - just brainstorming here.

Then at the subsystem level, the GPIO device struct would need a lock
that would be taken by every user-space operation AND the code
unregistering the device so that we don't do what you described (i.e.
if there's a thread doing a read(), then let's wait until it returns
before we drop the device).

This wouldn't fix the case in which the same situation happened in a
kernel driver but crashing the kernel from within is a much lesser
offense than allowing user-space to crash it.

So this patch is just papering over the problem for now, I suppose.

Bart

[1] https://lkml.org/lkml/2015/7/14/741
[2] https://www.youtube.com/watch?v=kW8LHWlJPTU
Andy Shevchenko Nov. 25, 2022, 6:02 p.m. UTC | #3
On Fri, Nov 25, 2022 at 07:56:10PM +0200, Andy Shevchenko wrote:
> On Fri, Nov 25, 2022 at 05:48:02PM +0100, Bartosz Golaszewski wrote:

...

Off topic here: can we have this [1] applied in the meantime?

[1]: https://lore.kernel.org/linux-gpio/d1e86166-b2b0-a235-fe9e-be9ee4d93290@huawei.com/
Bartosz Golaszewski Nov. 25, 2022, 9:03 p.m. UTC | #4
On Fri, Nov 25, 2022 at 6:56 PM Andy Shevchenko
<andriy.shevchenko@linux.intel.com> wrote:
>
> On Fri, Nov 25, 2022 at 05:48:02PM +0100, Bartosz Golaszewski wrote:
> > On Fri, Nov 25, 2022 at 5:24 PM Kent Gibson <warthog618@gmail.com> wrote:
>
> ...
>
> > Then at the subsystem level, the GPIO device struct would need a lock
> > that would be taken by every user-space operation AND the code
> > unregistering the device so that we don't do what you described (i.e.
> > if there's a thread doing a read(), then let's wait until it returns
> > before we drop the device).
>
> It's called a reference counting, basically you need to get device and then
> put when it makes sense.
>

Andy: I am aware of struct device reference counting but this isn't
it. You can count references all you want, but when I disconnect my
CP2112, the USB bus calls gpiochip_remove(), struct gpio_chip * inside
struct gpio_device is set to NULL and while the underlying struct
device itself is still alive, the GPIO chip is no longer usable.

Reference counting won't help because the device is no longer there,
so this behavior is correct but there's an issue with user-space still
being able to hold certain resources and we need to make sure that
when it tries to use them, we return an error instead of crashing.

I think that a good solution is to make sure we cannot set gdev->chip
to NULL as long as there are user-space operations in progress. After
all, it's better to try to send a USB request to an unplugged device
than to dereference a NULL pointer. To that end, we could have a
user-space lock that would also be taken by gpiochip_remove().

But this is still a per-subsystem solution. Most other subsystems
suffer from the same issue.

Bartosz
Andy Shevchenko Nov. 25, 2022, 9:33 p.m. UTC | #5
On Fri, Nov 25, 2022 at 10:03:06PM +0100, Bartosz Golaszewski wrote:
> On Fri, Nov 25, 2022 at 6:56 PM Andy Shevchenko
> <andriy.shevchenko@linux.intel.com> wrote:
> > On Fri, Nov 25, 2022 at 05:48:02PM +0100, Bartosz Golaszewski wrote:
> > > On Fri, Nov 25, 2022 at 5:24 PM Kent Gibson <warthog618@gmail.com> wrote:

...

> > > Then at the subsystem level, the GPIO device struct would need a lock
> > > that would be taken by every user-space operation AND the code
> > > unregistering the device so that we don't do what you described (i.e.
> > > if there's a thread doing a read(), then let's wait until it returns
> > > before we drop the device).
> >
> > It's called a reference counting, basically you need to get device and then
> > put when it makes sense.
> 
> Andy: I am aware of struct device reference counting but this isn't
> it. You can count references all you want, but when I disconnect my
> CP2112, the USB bus calls gpiochip_remove(), struct gpio_chip * inside
> struct gpio_device is set to NULL and while the underlying struct
> device itself is still alive, the GPIO chip is no longer usable.
> 
> Reference counting won't help because the device is no longer there,
> so this behavior is correct but there's an issue with user-space still
> being able to hold certain resources and we need to make sure that
> when it tries to use them, we return an error instead of crashing.

Thank you for the detailed explanation of the case.

> I think that a good solution is to make sure, we cannot set gdev->gc
> to NULL as long as there are user-space operations in progress. After
> all, it's better to try to send a USB request to an unplugged device
> than to dereference a NULL pointer. To that end, we could have a
> user-space lock that would also be taken by gpiochip_remove().
> 
> But this is still a per-subsystem solution. Most other subsystems
> suffer from the same issue.

Yeah, many subsystems are not ready for hotplug...
Kent Gibson Nov. 26, 2022, 12:39 a.m. UTC | #6
On Fri, Nov 25, 2022 at 10:03:06PM +0100, Bartosz Golaszewski wrote:
> On Fri, Nov 25, 2022 at 6:56 PM Andy Shevchenko
> <andriy.shevchenko@linux.intel.com> wrote:
> >
> > On Fri, Nov 25, 2022 at 05:48:02PM +0100, Bartosz Golaszewski wrote:
> > > On Fri, Nov 25, 2022 at 5:24 PM Kent Gibson <warthog618@gmail.com> wrote:
> >
> > ...
> >
> > > Then at the subsystem level, the GPIO device struct would need a lock
> > > that would be taken by every user-space operation AND the code
> > > unregistering the device so that we don't do what you described (i.e.
> > > if there's a thread doing a read(), then let's wait until it returns
> > > before we drop the device).
> >
> > It's called a reference counting, basically you need to get device and then
> > put when it makes sense.
> >
> 
> Andy: I am aware of struct device reference counting but this isn't
> it. You can count references all you want, but when I disconnect my
> CP2112, the USB bus calls gpiochip_remove(), struct gpio_chip * inside
> struct gpio_device is set to NULL and while the underlying struct
> device itself is still alive, the GPIO chip is no longer usable.
> 
> Reference counting won't help because the device is no longer there,
> so this behavior is correct but there's an issue with user-space still
> being able to hold certain resources and we need to make sure that
> when it tries to use them, we return an error instead of crashing.
> 
> I think that a good solution is to make sure, we cannot set gdev->gc
> to NULL as long as there are user-space operations in progress. After
> all, it's better to try to send a USB request to an unplugged device
> than to dereference a NULL pointer. To that end, we could have a
> user-space lock that would also be taken by gpiochip_remove().
> 

This is basically the answer I was hoping for - that there is some
barrier in place to prevent chip removal while an ioctl is active.
Then the check makes total sense - it is ensuring that the chip wasn't
removed before the ioctl began and the barrier went up.

On the other end, the caller of gpiochip_remove() needs to be prepared
to gracefully fail calls on the chip until gpiochip_remove() returns.
You would hope that is already the case...

> But this is still a per-subsystem solution. Most other subsystems
> suffer from the same issue.
> 

Does that prevent us addressing the problem in gpio until a more general
solution comes along?

Anyway, I'm basically ok with your patch as a first step, as it greatly
reduces the chances of triggering the fault, but it is only a band-aid
over a larger issue and a more complete solution would be preferable.
Without that, highlight in the checkin comment that it is not a complete
fix.

Cheers,
Kent.
diff mbox series

Patch

diff --git a/drivers/gpio/gpiolib-cdev.c b/drivers/gpio/gpiolib-cdev.c
index 0cb6b468f364..d5632742942a 100644
--- a/drivers/gpio/gpiolib-cdev.c
+++ b/drivers/gpio/gpiolib-cdev.c
@@ -201,6 +201,9 @@  static long linehandle_ioctl(struct file *file, unsigned int cmd,
 	unsigned int i;
 	int ret;
 
+	if (!lh->gdev->chip)
+		return -ENODEV;
+
 	switch (cmd) {
 	case GPIOHANDLE_GET_LINE_VALUES_IOCTL:
 		/* NOTE: It's okay to read values of output lines */
@@ -1384,6 +1387,9 @@  static long linereq_ioctl(struct file *file, unsigned int cmd,
 	struct linereq *lr = file->private_data;
 	void __user *ip = (void __user *)arg;
 
+	if (!lr->gdev->chip)
+		return -ENODEV;
+
 	switch (cmd) {
 	case GPIO_V2_LINE_GET_VALUES_IOCTL:
 		return linereq_get_values(lr, ip);
@@ -1716,6 +1722,9 @@  static __poll_t lineevent_poll(struct file *file,
 	struct lineevent_state *le = file->private_data;
 	__poll_t events = 0;
 
+	if (!le->gdev->chip)
+		return -ENODEV;
+
 	poll_wait(file, &le->wait, wait);
 
 	if (!kfifo_is_empty_spinlocked_noirqsave(&le->events, &le->wait.lock))
@@ -1740,6 +1749,9 @@  static ssize_t lineevent_read(struct file *file,
 	ssize_t ge_size;
 	int ret;
 
+	if (!le->gdev->chip)
+		return -ENODEV;
+
 	/*
 	 * When compatible system call is being used the struct gpioevent_data,
 	 * in case of at least ia32, has different size due to the alignment
@@ -1821,6 +1833,9 @@  static long lineevent_ioctl(struct file *file, unsigned int cmd,
 	void __user *ip = (void __user *)arg;
 	struct gpiohandle_data ghd;
 
+	if (!le->gdev->chip)
+		return -ENODEV;
+
 	/*
 	 * We can get the value for an event line but not set it,
 	 * because it is input by definition.