Message ID | 20240122220350.1449413-1-davidm@egauge.net |
---|---|
State | Superseded |
Headers | show |
Series | wifi: wilc1000: validate chip id during bus probe | expand |
On 1/22/24 23:03, David Mosberger-Tang wrote: > Previously, the driver created a net device (typically wlan0) as soon > as the module was loaded. This commit changes the driver to follow > normal Linux convention of creating the net device only when bus > probing detects a supported chip. I would gladly help review/test the patch, but please give us some time between versions to take a look (even if you can mention if you found issues yourself). Also, each version should be a separate thread, bearing the new version in the "Subject" line. Additionally (to answer your cover letter), the patches must target the wireless branches (likely wireless-testing), not linux-next (https://wireless.wiki.kernel.org/en/developers/documentation/git-guide) > Signed-off-by: David Mosberger-Tang <davidm@egauge.net> > --- > V2 -> V3: Add missing forward declarations, actually :-( > > drivers/net/wireless/microchip/wilc1000/spi.c | 133 ++++++++++++------ > .../net/wireless/microchip/wilc1000/wlan.h | 1 + > 2 files changed, 90 insertions(+), 44 deletions(-) > > diff --git a/drivers/net/wireless/microchip/wilc1000/spi.c b/drivers/net/wireless/microchip/wilc1000/spi.c > index 77b4cdff73c3..dd6935dc1bc9 100644 > --- a/drivers/net/wireless/microchip/wilc1000/spi.c > +++ b/drivers/net/wireless/microchip/wilc1000/spi.c > @@ -42,7 +42,7 @@ MODULE_PARM_DESC(enable_crc16, > #define WILC_SPI_RSP_HDR_EXTRA_DATA 8 > > struct wilc_spi { > - bool isinit; /* true if SPI protocol has been configured */ > + bool isinit; /* true if wilc_spi_init was successful */ > bool probing_crc; /* true if we're probing chip's CRC config */ > bool crc7_enabled; /* true if crc7 is currently enabled */ > bool crc16_enabled; /* true if crc16 is currently enabled */ > @@ -55,6 +55,8 @@ struct wilc_spi { > static const struct wilc_hif_func wilc_hif_spi; > > static int wilc_spi_reset(struct wilc *wilc); > +static int wilc_spi_configure_bus_protocol(struct wilc *wilc); > +static int wilc_validate_chipid(struct wilc *wilc); > > /******************************************** > * > @@ -232,6 +234,22 @@ static int wilc_bus_probe(struct spi_device *spi) > } > clk_prepare_enable(wilc->rtc_clk); > > + /* we need power to configure the bus protocol and to read the chip id: */ > + > + wilc_wlan_power(wilc, true); > + > + ret = wilc_spi_configure_bus_protocol(wilc); > + > + if (ret == 0) > + ret = wilc_validate_chipid(wilc); > + > + wilc_wlan_power(wilc, false); > + > + if (ret) { > + ret = -ENODEV; > + goto netdev_cleanup; I have a working wilc-over-spi setup with which I can easily unplug the module, so I gave a try to your series, and while the lack of chip detect indeed makes the netdevice registration not executed, I've got a nasty kasan warning: driver_probe_device from __driver_attach+0x1a0/0x29c [141/1863] __driver_attach from bus_for_each_dev+0xf0/0x14c bus_for_each_dev from bus_add_driver+0x130/0x288 bus_add_driver from driver_register+0xd4/0x1c0 driver_register from do_one_initcall+0xfc/0x204 do_one_initcall from kernel_init_freeable+0x240/0x2a0 kernel_init_freeable from kernel_init+0x20/0x144 kernel_init from ret_from_fork+0x14/0x28 Exception stack(0xc3163fb0 to 0xc3163ff8) 3fa0: 00000000 00000000 00000000 00000000 3fc0: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 3fe0: 00000000 00000000 00000000 00000000 00000013 00000000 Allocated by task 1: kasan_set_track+0x3c/0x5c __kasan_kmalloc+0x8c/0x94 __kmalloc_node+0x64/0x184 kvmalloc_node+0x48/0x114 alloc_netdev_mqs+0x68/0x664 alloc_etherdev_mqs+0x28/0x34 wilc_netdev_ifc_init+0x34/0x37c wilc_cfg80211_init+0x278/0x330 wilc_bus_probe+0xb4/0x398 spi_probe+0xb8/0xdc really_probe+0x134/0x588 __driver_probe_device+0xe0/0x288 driver_probe_device+0x60/0x118 __driver_attach+0x1a0/0x29c bus_for_each_dev+0xf0/0x14c bus_add_driver+0x130/0x288 driver_register+0xd4/0x1c0 do_one_initcall+0xfc/0x204 kernel_init_freeable+0x240/0x2a0 kernel_init+0x20/0x144 ret_from_fork+0x14/0x28 Freed by task 1: kasan_set_track+0x3c/0x5c kasan_save_free_info+0x30/0x3c __kasan_slab_free+0xe4/0x12c __kmem_cache_free+0x94/0x1cc device_release+0x54/0xf8 kobject_put+0xf4/0x238 netdev_run_todo+0x414/0x7dc wilc_netdev_cleanup+0xe4/0x244 wilc_bus_probe+0x2b8/0x398 spi_probe+0xb8/0xdc really_probe+0x134/0x588 __driver_probe_device+0xe0/0x288 driver_probe_device+0x60/0x118 __driver_attach+0x1a0/0x29c bus_for_each_dev+0xf0/0x14c bus_add_driver+0x130/0x288 driver_register+0xd4/0x1c0 do_one_initcall+0xfc/0x204 kernel_init_freeable+0x240/0x2a0 kernel_init+0x20/0x144 ret_from_fork+0x14/0x28 It looks like an already existing/dormant issue in the error-managing path of spi probe of the driver (looks like we are trying to unregister a netdevice which has never been registered ?), but since your series triggers it, it should be handled too. > + } > + > return 0; > > netdev_cleanup: > @@ -1074,58 +1092,43 @@ static int wilc_spi_write(struct wilc *wilc, u32 addr, u8 *buf, u32 size) > * > ********************************************/ > > -static int wilc_spi_reset(struct wilc *wilc) > +static const char * > +strbool(bool val) I am not convinced we need a dedicated helper just to print boolean values (and why the super short line break ?) Also, such change should likely be in a separate patch since it generate quite some lines of diff but none of those being about the initial topic > { > - struct spi_device *spi = to_spi_device(wilc->dev); > - struct wilc_spi *spi_priv = wilc->bus_data; > - int result; > - > - result = wilc_spi_special_cmd(wilc, CMD_RESET); > - if (result && !spi_priv->probing_crc) > - dev_err(&spi->dev, "Failed cmd reset\n"); > - > - return result; > -} > - > -static bool wilc_spi_is_init(struct wilc *wilc) > -{ > - struct wilc_spi *spi_priv = wilc->bus_data; > - > - return spi_priv->isinit; > + return val ? "on" : "off"; > } > > -static int wilc_spi_deinit(struct wilc *wilc) > +static int wilc_validate_chipid(struct wilc *wilc) > { > - struct wilc_spi *spi_priv = wilc->bus_data; > + struct spi_device *spi = to_spi_device(wilc->dev); > + u32 chipid, base_id; > + int ret; > > - spi_priv->isinit = false; > - wilc_wlan_power(wilc, false); > + /* > + * make sure can read chip id without protocol error > + */ > + ret = wilc_spi_read_reg(wilc, WILC_CHIPID, &chipid); > + if (ret) { > + dev_err(&spi->dev, "Fail cmd read chip id...\n"); > + return ret; > + } > + base_id = chipid & ~WILC_CHIP_REV_FIELD; > + if (base_id != WILC_1000_BASE_ID && base_id != WILC_3000_BASE_ID) { - WILC3000 is currently not supported (yet) by the upstream driver, so there is no reason to validate its presence if we can not handle it later. Any mention of it should then be removed from this series - I see that there is already a helper to handle masking and check chip id in wlan.c (is_wilc1000). You should likely reuse this/avoid the duplication > + dev_err(&spi->dev, "Unknown chip id 0x%x\n", chipid); > + return ret; > + } > + dev_info(&spi->dev, "Detected chip id 0x%x (crc7=%s, crc16=%s)\n", > + chipid, strbool(enable_crc7), strbool(enable_crc16)); > return 0; > } > > -static int wilc_spi_init(struct wilc *wilc, bool resume) > +static int wilc_spi_configure_bus_protocol(struct wilc *wilc) > { > struct spi_device *spi = to_spi_device(wilc->dev); > struct wilc_spi *spi_priv = wilc->bus_data; > u32 reg; > - u32 chipid; > int ret, i; > > - if (spi_priv->isinit) { > - /* Confirm we can read chipid register without error: */ > - ret = wilc_spi_read_reg(wilc, WILC_CHIPID, &chipid); > - if (ret == 0) > - return 0; > - > - dev_err(&spi->dev, "Fail cmd read chip id...\n"); > - } > - > - wilc_wlan_power(wilc, true); > - > - /* > - * configure protocol > - */ > - > /* > * Infer the CRC settings that are currently in effect. This > * is necessary because we can't be sure that the chip has > @@ -1176,12 +1179,54 @@ static int wilc_spi_init(struct wilc *wilc, bool resume) > > spi_priv->probing_crc = false; > > - /* > - * make sure can read chip id without protocol error > - */ > - ret = wilc_spi_read_reg(wilc, WILC_CHIPID, &chipid); > + return 0; > +} > + > +static int wilc_spi_reset(struct wilc *wilc) > +{ > + struct spi_device *spi = to_spi_device(wilc->dev); > + struct wilc_spi *spi_priv = wilc->bus_data; > + int result; > + > + result = wilc_spi_special_cmd(wilc, CMD_RESET); > + if (result && !spi_priv->probing_crc) > + dev_err(&spi->dev, "Failed cmd reset\n"); > + > + return result; > +} > + > +static bool wilc_spi_is_init(struct wilc *wilc) > +{ > + struct wilc_spi *spi_priv = wilc->bus_data; > + > + return spi_priv->isinit; > +} > + > +static int wilc_spi_deinit(struct wilc *wilc) > +{ > + struct wilc_spi *spi_priv = wilc->bus_data; > + > + spi_priv->isinit = false; > + wilc_wlan_power(wilc, false); > + return 0; > +} > + > +static int wilc_spi_init(struct wilc *wilc, bool resume) > +{ > + struct wilc_spi *spi_priv = wilc->bus_data; > + int ret; > + > + if (spi_priv->isinit) { Ok, so indeed in this new version of the patch, the flag still makes sense for upper layers. > + /* Confirm we can read chipid register without error: */ > + if (wilc_validate_chipid(wilc) == 0) > + return 0; > + } > + > + wilc_wlan_power(wilc, true); > + > + ret = wilc_spi_configure_bus_protocol(wilc); > if (ret) { > - dev_err(&spi->dev, "Fail cmd read chip id...\n"); > + wilc_wlan_power(wilc, false); > return ret; > } > > diff --git a/drivers/net/wireless/microchip/wilc1000/wlan.h b/drivers/net/wireless/microchip/wilc1000/wlan.h > index a72cd5cac81d..43dda9f0d9ca 100644 > --- a/drivers/net/wireless/microchip/wilc1000/wlan.h > +++ b/drivers/net/wireless/microchip/wilc1000/wlan.h > @@ -182,6 +182,7 @@ > #define WILC_CORTUS_BOOT_FROM_IRAM 0x71 > > #define WILC_1000_BASE_ID 0x100000 > +#define WILC_3000_BASE_ID 0x300000 See comment above for WILC3000 > > #define WILC_1000_BASE_ID_2A 0x1002A0 > #define WILC_1000_BASE_ID_2A_REV1 (WILC_1000_BASE_ID_2A + 1)
On 1/23/24 12:06, Kalle Valo wrote: > Alexis Lothoré <alexis.lothore@bootlin.com> writes: > >> On 1/22/24 23:03, David Mosberger-Tang wrote: >>> Previously, the driver created a net device (typically wlan0) as soon >>> as the module was loaded. This commit changes the driver to follow >>> normal Linux convention of creating the net device only when bus >>> probing detects a supported chip. >> >> I would gladly help review/test the patch, but please give us some time between >> versions to take a look (even if you can mention if you found issues yourself). >> Also, each version should be a separate thread, bearing the new version in the >> "Subject" line. >> Additionally (to answer your cover letter), the patches must target the wireless >> branches (likely wireless-testing), not linux-next >> (https://wireless.wiki.kernel.org/en/developers/documentation/git-guide) > > Actually wireless-next is preferred for the baseline (unless it's a fix > going to -rc releases): > > https://git.kernel.org/pub/scm/linux/kernel/git/wireless/wireless-next.git/ Oh, ok, thanks for the correction, I may have misinterpreted the wiki then
On 1/23/24 13:59, Kalle Valo wrote: > Alexis Lothoré <alexis.lothore@bootlin.com> writes: > >> On 1/23/24 12:06, Kalle Valo wrote: >>> Alexis Lothoré <alexis.lothore@bootlin.com> writes: >>> >>>> On 1/22/24 23:03, David Mosberger-Tang wrote: >>>>> Previously, the driver created a net device (typically wlan0) as soon >>>>> as the module was loaded. This commit changes the driver to follow >>>>> normal Linux convention of creating the net device only when bus >>>>> probing detects a supported chip. >>>> >>>> I would gladly help review/test the patch, but please give us some time between >>>> versions to take a look (even if you can mention if you found issues yourself). >>>> Also, each version should be a separate thread, bearing the new version in the >>>> "Subject" line. >>>> Additionally (to answer your cover letter), the patches must target the wireless >>>> branches (likely wireless-testing), not linux-next >>>> (https://wireless.wiki.kernel.org/en/developers/documentation/git-guide) >>> >>> Actually wireless-next is preferred for the baseline (unless it's a fix >>> going to -rc releases): >>> >>> https://git.kernel.org/pub/scm/linux/kernel/git/wireless/wireless-next.git/ >> >> Oh, ok, thanks for the correction, I may have misinterpreted the wiki then > > Ah, we should update that page. That page was written before we had > common wireless and wireless-next trees. > > I don't know Johannes thoughts on this but my recommendation for > baseline: > > * use wireless tree for important fixes going to -rc releases > > * for other patches use either driver specific tree (eg. iwlwifi, mt76, > ath) or wireless-next (if no driver specific tree available) > > * for automated testing etc. use wireless-testing as it's a merge of > wireless and wireless-next and contains all latest code Thanks for the details. I'll wait a bit in case Johannes or anyone else wants to add anything, then I can take care of updating the corresponding page
On 1/23/24 18:39, David Mosberger-Tang wrote: > On Tue, 2024-01-23 at 11:18 +0100, Alexis Lothoré wrote: >> On 1/22/24 23:03, David Mosberger-Tang wrote: [...] >> I have a working wilc-over-spi setup with which I can easily unplug the module, >> so I gave a try to your series, and while the lack of chip detect indeed makes >> the netdevice registration not executed, I've got a nasty kasan warning: >> >> driver_probe_device from __driver_attach+0x1a0/0x29c >> >> >> >> [141/1863] >> __driver_attach from bus_for_each_dev+0xf0/0x14c >> bus_for_each_dev from bus_add_driver+0x130/0x288 >> bus_add_driver from driver_register+0xd4/0x1c0 >> driver_register from do_one_initcall+0xfc/0x204 >> do_one_initcall from kernel_init_freeable+0x240/0x2a0 >> kernel_init_freeable from kernel_init+0x20/0x144 >> kernel_init from ret_from_fork+0x14/0x28 >> Exception stack(0xc3163fb0 to 0xc3163ff8) >> 3fa0: 00000000 00000000 00000000 00000000 >> 3fc0: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 >> 3fe0: 00000000 00000000 00000000 00000000 00000013 00000000 >> >> Allocated by task 1: >> kasan_set_track+0x3c/0x5c >> __kasan_kmalloc+0x8c/0x94 >> __kmalloc_node+0x64/0x184 >> kvmalloc_node+0x48/0x114 >> alloc_netdev_mqs+0x68/0x664 >> alloc_etherdev_mqs+0x28/0x34 >> wilc_netdev_ifc_init+0x34/0x37c >> wilc_cfg80211_init+0x278/0x330 >> wilc_bus_probe+0xb4/0x398 >> spi_probe+0xb8/0xdc >> really_probe+0x134/0x588 >> __driver_probe_device+0xe0/0x288 >> driver_probe_device+0x60/0x118 >> __driver_attach+0x1a0/0x29c >> bus_for_each_dev+0xf0/0x14c >> bus_add_driver+0x130/0x288 >> driver_register+0xd4/0x1c0 >> do_one_initcall+0xfc/0x204 >> kernel_init_freeable+0x240/0x2a0 >> kernel_init+0x20/0x144 >> ret_from_fork+0x14/0x28 >> >> Freed by task 1: >> kasan_set_track+0x3c/0x5c >> kasan_save_free_info+0x30/0x3c >> __kasan_slab_free+0xe4/0x12c >> __kmem_cache_free+0x94/0x1cc >> device_release+0x54/0xf8 >> kobject_put+0xf4/0x238 >> netdev_run_todo+0x414/0x7dc >> wilc_netdev_cleanup+0xe4/0x244 >> wilc_bus_probe+0x2b8/0x398 >> spi_probe+0xb8/0xdc >> really_probe+0x134/0x588 >> __driver_probe_device+0xe0/0x288 >> driver_probe_device+0x60/0x118 >> __driver_attach+0x1a0/0x29c >> bus_for_each_dev+0xf0/0x14c >> bus_add_driver+0x130/0x288 >> driver_register+0xd4/0x1c0 >> do_one_initcall+0xfc/0x204 >> kernel_init_freeable+0x240/0x2a0 >> kernel_init+0x20/0x144 >> ret_from_fork+0x14/0x28 >> >> It looks like an already existing/dormant issue in the error-managing path of >> spi probe of the driver (looks like we are trying to unregister a netdevice >> which has never been registered ?), but since your series triggers it, it should >> be handled too. > > I need help interpreting this. What does KASAN actually complain about? A > double free or something else? I see that the kasan dump from my last email is truncated, but the first line clearly mentions a use-after-free: ================================================================== BUG: KASAN: slab-use-after-free in wilc_netdev_cleanup+0x294/0x2c0 Read of size 4 at addr c3c91ce8 by task swapper/1 CPU: 0 PID: 1 Comm: swapper Not tainted 6.7.0-wt+ #843 Hardware name: Atmel SAMA5 unwind_backtrace from show_stack+0x18/0x1c show_stack from dump_stack_lvl+0x34/0x48 dump_stack_lvl from print_report+0x154/0x500 print_report from kasan_report+0xd8/0x100 kasan_report from wilc_netdev_cleanup+0x294/0x2c0 wilc_netdev_cleanup from wilc_bus_probe+0x2b8/0x398 wilc_bus_probe from spi_probe+0xb8/0xdc spi_probe from really_probe+0x134/0x588 really_probe from __driver_probe_device+0xe0/0x288 __driver_probe_device from driver_probe_device+0x60/0x118 driver_probe_device from __driver_attach+0x1a0/0x29c __driver_attach from bus_for_each_dev+0xf0/0x14c bus_for_each_dev from bus_add_driver+0x130/0x288 bus_add_driver from driver_register+0xd4/0x1c0 driver_register from do_one_initcall+0xfc/0x204 do_one_initcall from kernel_init_freeable+0x240/0x2a0 kernel_init_freeable from kernel_init+0x20/0x144 kernel_init from ret_from_fork+0x14/0x28 Not sure though what's wrong without digging further. > register_netdev() does get called (through wilc_cfg80211_init()) and then when > the chip detect fails, unregister_netdev() gets called (from > wilc_netdev_cleanup()), so I don't see any obvious issues, but there is a lot of > other stuff going on there that I'm not familiar with. My bad, your statement made me realize I overlooked things here: aside from the kasan warning, your patch makes the probe function do the following steps: - create and register (wiphy and) netdevice - check if chip is detected on bus - unregister/clean up everything if chip does not respond There's no point in pre-registering the netdevice so early if we add an error path due to chip being absent, I would even say that the whole point of your series is to prevent registering the device if chip can not be accessed. So IMHO chip detection should be done before trying to register the netdevice. It will prevent all the complications due to the whole reverse unregistration (and all weird issues that can occur because of device being registered while not being fully ready) >>> + if (base_id != WILC_1000_BASE_ID && base_id != WILC_3000_BASE_ID) { >> >> - WILC3000 is currently not supported (yet) by the upstream driver, so there is >> no reason to validate its presence if we can not handle it later. Any mention of >> it should then be removed from this series > > Oh, I didn't realize that. I was just going off of this web page: > > https://www.microchip.com/en-us/software-library/wilc1000_3000_linux_driver Understood, but again, your patch must be based on upstream trees :)
On Wed, 2024-01-24 at 10:01 +0100, Alexis Lothoré wrote: > On 1/23/24 18:39, David Mosberger-Tang wrote: > > > > > > What does KASAN actually complain about? A double free or something else? > > I see that the kasan dump from my last email is truncated, but the first line > clearly mentions a use-after-free: > > ================================================================== > BUG: KASAN: slab-use-after-free in wilc_netdev_cleanup+0x294/0x2c0 Ah, that's helpful, thanks! Can you map wilc_netdev_cleanup+0x294 to the corresponding source line? Are you on ARM64 or something else? --david
Alexis, On Wed, 2024-01-24 at 10:01 +0100, Alexis Lothoré wrote: > ================================================================== > BUG: KASAN: slab-use-after-free in wilc_netdev_cleanup+0x294/0x2c0 > Read of size 4 at addr c3c91ce8 by task swapper/1 OK, I think I see what's going on: it's the list traversal. Here is what wilc_netdev_cleanup() does: list_for_each_entry_rcu(vif, &wilc->vif_list, list) { if (vif->ndev) unregister_netdev(vif->ndev); } The problem is that "vif" is the private part of the netdev, so when the netdev is freed, the vif structure is no longer valid and list_for_each_entry_rcu() ends up dereferencing a dangling pointer. Ajay or Alexis, could you propose a fix for this - this is not something I'm familiar with. Thanks! --david
On Wed, 2024-01-24 at 10:31 -0700, David Mosberger-Tang wrote: > Alexis, > > On Wed, 2024-01-24 at 10:01 +0100, Alexis Lothoré wrote: > > ================================================================== > > BUG: KASAN: slab-use-after-free in wilc_netdev_cleanup+0x294/0x2c0 > > Read of size 4 at addr c3c91ce8 by task swapper/1 > > OK, I think I see what's going on: it's the list traversal. Here is what > wilc_netdev_cleanup() does: > > list_for_each_entry_rcu(vif, &wilc->vif_list, list) { > if (vif->ndev) > unregister_netdev(vif->ndev); > } > > The problem is that "vif" is the private part of the netdev, so when the netdev > is freed, the vif structure is no longer valid and list_for_each_entry_rcu() > ends up dereferencing a dangling pointer. > > Ajay or Alexis, could you propose a fix for this - this is not something I'm > familiar with. Actually, after staring at the code a little longer, is there anything wrong with delaying the unregister_netdev() call until after the vif has been removed from the vif-list? Something along the lines of the below. --david diff --git a/drivers/net/wireless/microchip/wilc1000/netdev.c b/drivers/net/wireless/microchip/wilc1000/netdev.c index 0bf6aef4661e..e78e7d971243 100644 --- a/drivers/net/wireless/microchip/wilc1000/netdev.c +++ b/drivers/net/wireless/microchip/wilc1000/netdev.c @@ -884,7 +884,7 @@ static const struct net_device_ops wilc_netdev_ops = { void wilc_netdev_cleanup(struct wilc *wilc) { struct wilc_vif *vif; - int srcu_idx, ifc_cnt = 0; + int ifc_cnt = 0; if (!wilc) return; @@ -894,13 +894,6 @@ void wilc_netdev_cleanup(struct wilc *wilc) wilc->firmware = NULL; } - srcu_idx = srcu_read_lock(&wilc->srcu); - list_for_each_entry_rcu(vif, &wilc->vif_list, list) { - if (vif->ndev) - unregister_netdev(vif->ndev); - } - srcu_read_unlock(&wilc->srcu, srcu_idx); - wilc_wfi_deinit_mon_interface(wilc, false); destroy_workqueue(wilc->hif_workqueue); @@ -918,6 +911,10 @@ void wilc_netdev_cleanup(struct wilc *wilc) mutex_unlock(&wilc->vif_mutex); synchronize_srcu(&wilc->srcu); ifc_cnt++; + + if (vif->ndev) + /* vif gets freed as part of this call: */ + unregister_netdev(vif->ndev); } wilc_wlan_cfg_deinit(wilc);
On 1/24/24 11:45, David Mosberger-Tang wrote: > EXTERNAL EMAIL: Do not click links or open attachments unless you know the content is safe > > On Wed, 2024-01-24 at 10:31 -0700, David Mosberger-Tang wrote: >> Alexis, >> >> On Wed, 2024-01-24 at 10:01 +0100, Alexis Lothoré wrote: >>> ================================================================== >>> BUG: KASAN: slab-use-after-free in wilc_netdev_cleanup+0x294/0x2c0 >>> Read of size 4 at addr c3c91ce8 by task swapper/1 >> >> OK, I think I see what's going on: it's the list traversal. Here is what >> wilc_netdev_cleanup() does: >> >> list_for_each_entry_rcu(vif, &wilc->vif_list, list) { >> if (vif->ndev) >> unregister_netdev(vif->ndev); >> } >> >> The problem is that "vif" is the private part of the netdev, so when the netdev >> is freed, the vif structure is no longer valid and list_for_each_entry_rcu() >> ends up dereferencing a dangling pointer. >> >> Ajay or Alexis, could you propose a fix for this - this is not something I'm >> familiar with. > > Actually, after staring at the code a little longer, is there anything wrong > with delaying the unregister_netdev() call until after the vif has been removed > from the vif-list? Something along the lines of the below. I think we need to investigate the actual reason for Kasan warning. First, we need to confirm if this warning exists without the patch(test by simulating a force error in wilc_bus_probe()). When wilc_netdev_cleanup() is also called from wilc_bus_remove(), I believe this warning was not observed. Once it is confirmed, the fix can be done accordingly. Avoiding netdev initialization in wilc_cfg80211_init() would require lot of modification and changes in API call order. IMO the Kasan warning fix should be independent of netdev initialization order and it should free-up the resources for all cases. Suppose if the card is not present, the probe API should clean-up and exit gracefully. For detecting the chip_id, I have created the below patch considering the above scenarios, please check if it makes sense. --- a/drivers/net/wireless/microchip/wilc1000/spi.c +++ b/drivers/net/wireless/microchip/wilc1000/spi.c @@ -225,6 +225,11 @@ static int wilc_bus_probe(struct spi_device *spi) if (ret < 0) goto netdev_cleanup; + if (!is_wilc_chip_connected(wilc)) { + ret = -ENODEV; + goto netdev_cleanup; + } + wilc->rtc_clk = devm_clk_get_optional(&spi->dev, "rtc"); if (IS_ERR(wilc->rtc_clk)) { ret = PTR_ERR(wilc->rtc_clk); diff --git a/drivers/net/wireless/microchip/wilc1000/wlan.c b/drivers/net/wireless/microchip/wilc1000/wlan.c index 6b2f2269ddf8..6f95456b053b 100644 --- a/drivers/net/wireless/microchip/wilc1000/wlan.c +++ b/drivers/net/wireless/microchip/wilc1000/wlan.c @@ -1537,3 +1537,22 @@ int wilc_wlan_init(struct net_device *dev) return ret; } + +bool is_wilc_chip_connected(struct wilc *wl) +{ + u32 chipid = 0; + int ret; + + acquire_bus(wl, WILC_BUS_ACQUIRE_ONLY); + ret = wl->hif_func->hif_init(wl, false); + if (!ret) { + wl->hif_func->hif_read_reg(wl, WILC_CHIPID, &chipid); + wl->hif_func->hif_deinit(wl); + } + release_bus(wl, WILC_BUS_RELEASE_ONLY); + if (ret || !is_wilc1000(chipid)) + return false; + + return true; +} +EXPORT_SYMBOL_GPL(is_wilc_chip_connected); diff --git a/drivers/net/wireless/microchip/wilc1000/wlan.h b/drivers/net/wireless/microchip/wilc1000/wlan.h index f02775f7e41f..8585dda0790c 100644 --- a/drivers/net/wireless/microchip/wilc1000/wlan.h +++ b/drivers/net/wireless/microchip/wilc1000/wlan.h @@ -440,4 +440,5 @@ int wilc_send_config_pkt(struct wilc_vif *vif, u8 mode, struct wid *wids, u32 count); int wilc_wlan_init(struct net_device *dev); u32 wilc_get_chipid(struct wilc *wilc, bool update); +bool is_wilc_chip_connected(struct wilc *wilc); #endif -- Regards, Ajay
On 1/25/24 07:23, Ajay.Kathat@microchip.com wrote: > On 1/24/24 11:45, David Mosberger-Tang wrote: >> EXTERNAL EMAIL: Do not click links or open attachments unless you know the content is safe >> >> On Wed, 2024-01-24 at 10:31 -0700, David Mosberger-Tang wrote: >>> Alexis, >>> >>> On Wed, 2024-01-24 at 10:01 +0100, Alexis Lothoré wrote: >>>> ================================================================== >>>> BUG: KASAN: slab-use-after-free in wilc_netdev_cleanup+0x294/0x2c0 >>>> Read of size 4 at addr c3c91ce8 by task swapper/1 Replying a bit late to your initial questions: - I am running an ARM32 platform (SAMA5D27) - for the wilc_netdev_cleanup line, the 0x294 offset indeed points to list_for_each_entry_rcu(vif, &wilc->vif_list, list) in my case, but you seemed to have identified this already. >>> >>> OK, I think I see what's going on: it's the list traversal. Here is what >>> wilc_netdev_cleanup() does: >>> >>> list_for_each_entry_rcu(vif, &wilc->vif_list, list) { >>> if (vif->ndev) >>> unregister_netdev(vif->ndev); >>> } >>> >>> The problem is that "vif" is the private part of the netdev, so when the netdev >>> is freed, the vif structure is no longer valid and list_for_each_entry_rcu() >>> ends up dereferencing a dangling pointer. Your diagnostic sounds relevant :) >>> Ajay or Alexis, could you propose a fix for this - this is not something I'm >>> familiar with. >> >> Actually, after staring at the code a little longer, is there anything wrong >> with delaying the unregister_netdev() call until after the vif has been removed >> from the vif-list? Something along the lines of the below. I guess you _could_ do something like this, but I think you have to be very careful about potential races. If I'm not wrong the following could happen: - you enter the wilc_netdev_cleanup and start removing vifs from list - meanwhile, because your net device is still registered, userspace can start calling concurrently some cgf80211_ops - some of those ops may try to get the vif matching your netdevice from the list, but it is not there anymore Maybe some rtnl or wiphy lock (used from cfg80211 core or net core) will save you from this for some of wilc_netdev_cleanup calls, but I think that won't be true for the one in the error path of the probe chain. > I think we need to investigate the actual reason for Kasan warning. > First, we need to confirm if this warning exists without the patch(test > by simulating a force error in wilc_bus_probe()). When > wilc_netdev_cleanup() is also called from wilc_bus_remove(), I believe > this warning was not observed. Once it is confirmed, the fix can be done > accordingly. It happens too in wilc_bus_remove: echo "spi0.1" > /sys/bus/spi/drivers/wilc1000_spi/unbind ================================================================== BUG: KASAN: slab-use-after-free in wilc_netdev_cleanup+0xf0/0x244 Read of size 4 at addr c3c91ce8 by task sh/91 CPU: 0 PID: 91 Comm: sh Not tainted 6.7.0-wt+ #845 Hardware name: Atmel SAMA5 unwind_backtrace from show_stack+0x18/0x1c show_stack from dump_stack_lvl+0x34/0x48 dump_stack_lvl from print_report+0x154/0x500 print_report from kasan_report+0xd8/0x100 kasan_report from wilc_netdev_cleanup+0xf0/0x244 wilc_netdev_cleanup from wilc_bus_remove+0x50/0x5c wilc_bus_remove from spi_remove+0x40/0x50 spi_remove from device_release_driver_internal+0x21c/0x2ac device_release_driver_internal from unbind_store+0x70/0xac unbind_store from kernfs_fop_write_iter+0x1a0/0x284 kernfs_fop_write_iter from vfs_write+0x38c/0x6f4 vfs_write from ksys_write+0xd8/0x178 ksys_write from ret_fast_syscall+0x0/0x1c Exception stack(0xc588ffa8 to 0xc588fff0) ffa0: 00000007 004eff68 00000001 004eff68 00000007 00000001 ffc0: 00000007 004eff68 00000001 00000004 00000007 b69546c0 00000020 004ed190 ffe0: 00000004 b6954678 aec3a041 aebd1a26 > Avoiding netdev initialization in wilc_cfg80211_init() would require lot > of modification and changes in API call order. IMO the Kasan warning fix > should be independent of netdev initialization order and it should > free-up the resources for all cases. Suppose if the card is not present, > the probe API should clean-up and exit gracefully. For detecting the > chip_id, I have created the below patch considering the above scenarios, > please check if it makes sense. Agree about the wilc_netdev_cleanup fix being a separated topic, to be handled accordingly. Still, the scenario I mention above, if true, makes me more confident that we should not register at all the netdevice before being able to manage it. Maybe those cases are already handled correctly with some checks to make sure no real crash occurs, but all those checks could be avoided if we did not register the netdevice so early Alexis
On 1/25/24 04:04, Alexis Lothoré wrote: > EXTERNAL EMAIL: Do not click links or open attachments unless you know the content is safe > > On 1/25/24 07:23, Ajay.Kathat@microchip.com wrote: >> On 1/24/24 11:45, David Mosberger-Tang wrote: >>> EXTERNAL EMAIL: Do not click links or open attachments unless you know the content is safe >>> >>> On Wed, 2024-01-24 at 10:31 -0700, David Mosberger-Tang wrote: >>>> Alexis, >>>> >>>> On Wed, 2024-01-24 at 10:01 +0100, Alexis Lothoré wrote: >>>>> ================================================================== >>>>> BUG: KASAN: slab-use-after-free in wilc_netdev_cleanup+0x294/0x2c0 >>>>> Read of size 4 at addr c3c91ce8 by task swapper/1 > > Replying a bit late to your initial questions: > - I am running an ARM32 platform (SAMA5D27) > - for the wilc_netdev_cleanup line, the 0x294 offset indeed points to > list_for_each_entry_rcu(vif, &wilc->vif_list, list) in my case, but you seemed > to have identified this already. > >>>> >>>> OK, I think I see what's going on: it's the list traversal. Here is what >>>> wilc_netdev_cleanup() does: >>>> >>>> list_for_each_entry_rcu(vif, &wilc->vif_list, list) { >>>> if (vif->ndev) >>>> unregister_netdev(vif->ndev); >>>> } >>>> >>>> The problem is that "vif" is the private part of the netdev, so when the netdev >>>> is freed, the vif structure is no longer valid and list_for_each_entry_rcu() >>>> ends up dereferencing a dangling pointer. > > Your diagnostic sounds relevant :) > >>>> Ajay or Alexis, could you propose a fix for this - this is not something I'm >>>> familiar with. >>> >>> Actually, after staring at the code a little longer, is there anything wrong >>> with delaying the unregister_netdev() call until after the vif has been removed >>> from the vif-list? Something along the lines of the below. > > I guess you _could_ do something like this, but I think you have to be very > careful about potential races. If I'm not wrong the following could happen: > - you enter the wilc_netdev_cleanup and start removing vifs from list > - meanwhile, because your net device is still registered, userspace can start > calling concurrently some cgf80211_ops IFAIK cfg80211_ops shouldn't get called until the netdev interface is up(ifconfig wlan0 up). In the probe, only the netdev interface is allocated and cfg80211_ops is registered and the cfg80211_ops callback should be called when the interface is up. > - some of those ops may try to get the vif matching your netdevice from the > list, but it is not there anymore > I have not tested this by enabling Kasan configuration so I haven't observe this issue yet. As I understand, the warning(use-after-free) is reported for the below line in wilc_netdev_cleanup() so that must be the code where used-after-free warning is reported. Isn't it. >>>> list_for_each_entry_rcu(vif, &wilc->vif_list, list) { > Maybe some rtnl or wiphy lock (used from cfg80211 core or net core) will save > you from this for some of wilc_netdev_cleanup calls, but I think that won't be > true for the one in the error path of the probe chain. > >> I think we need to investigate the actual reason for Kasan warning. >> First, we need to confirm if this warning exists without the patch(test >> by simulating a force error in wilc_bus_probe()). When >> wilc_netdev_cleanup() is also called from wilc_bus_remove(), I believe >> this warning was not observed. Once it is confirmed, the fix can be done >> accordingly. > > It happens too in wilc_bus_remove: > echo "spi0.1" > /sys/bus/spi/drivers/wilc1000_spi/unbind Okay, so it confirms that this warning is not related to the probe function. > > ================================================================== > BUG: KASAN: slab-use-after-free in wilc_netdev_cleanup+0xf0/0x244 > Read of size 4 at addr c3c91ce8 by task sh/91 > > CPU: 0 PID: 91 Comm: sh Not tainted 6.7.0-wt+ #845 > Hardware name: Atmel SAMA5 > unwind_backtrace from show_stack+0x18/0x1c > show_stack from dump_stack_lvl+0x34/0x48 > dump_stack_lvl from print_report+0x154/0x500 > print_report from kasan_report+0xd8/0x100 > kasan_report from wilc_netdev_cleanup+0xf0/0x244 > wilc_netdev_cleanup from wilc_bus_remove+0x50/0x5c > wilc_bus_remove from spi_remove+0x40/0x50 > spi_remove from device_release_driver_internal+0x21c/0x2ac > device_release_driver_internal from unbind_store+0x70/0xac > unbind_store from kernfs_fop_write_iter+0x1a0/0x284 > kernfs_fop_write_iter from vfs_write+0x38c/0x6f4 > vfs_write from ksys_write+0xd8/0x178 > ksys_write from ret_fast_syscall+0x0/0x1c > Exception stack(0xc588ffa8 to 0xc588fff0) > ffa0: 00000007 004eff68 00000001 004eff68 00000007 00000001 > ffc0: 00000007 004eff68 00000001 00000004 00000007 b69546c0 00000020 004ed190 > ffe0: 00000004 b6954678 aec3a041 aebd1a26 > >> Avoiding netdev initialization in wilc_cfg80211_init() would require lot >> of modification and changes in API call order. IMO the Kasan warning fix >> should be independent of netdev initialization order and it should >> free-up the resources for all cases. Suppose if the card is not present, >> the probe API should clean-up and exit gracefully. For detecting the >> chip_id, I have created the below patch considering the above scenarios, >> please check if it makes sense. > > Agree about the wilc_netdev_cleanup fix being a separated topic, to be handled > accordingly. > Still, the scenario I mention above, if true, makes me more confident that we > should not register at all the netdevice before being able to manage it. Maybe > those cases are already handled correctly with some checks to make sure no real > crash occurs, but all those checks could be avoided if we did not register the > netdevice so early > neering I think this issue is not related to early registration of netdevice since same behavior was observed with "echo "spi0.1" > /sys/bus/spi/drivers/wilc1000_spi/unbind" command. If needed, the netdevice allocation can be delayed until other conditions(resources) are allocated/successful. But it is also possible that the other conditions(resource) are successful but netdevice allocation gets failed, in that case allocating other resources before may not be correct. For the success case, all the condition should succeed. Currently the driver initialization has a order that already invokes "netdev_clean:" in wilc_bus_probe() for failure cases, so it should free up the resources completely. Additionally, this warning was reported at run time(not only in probe). I believe if it is fix in wilc_netdev_cleanup() then it will cover for all the scenarios. Regards, Ajay
On Thu, 2024-01-25 at 12:04 +0100, Alexis Lothoré wrote: > On 1/25/24 07:23, Ajay.Kathat@microchip.com wrote: > > On 1/24/24 11:45, David Mosberger-Tang wrote: > > > > > > > > > > OK, I think I see what's going on: it's the list traversal. Here is what > > > > wilc_netdev_cleanup() does: > > > > > > > > list_for_each_entry_rcu(vif, &wilc->vif_list, list) { > > > > if (vif->ndev) > > > > unregister_netdev(vif->ndev); > > > > } > > > > > > > > The problem is that "vif" is the private part of the netdev, so when the netdev > > > > is freed, the vif structure is no longer valid and list_for_each_entry_rcu() > > > > ends up dereferencing a dangling pointer. > > Your diagnostic sounds relevant :) Yeah, it's definitely what's going on. And it's not just the list traversal: afterwards, wilc_netdev_cleanup() continues to access the vif structure while removing them from the vif-list. I think the original idea of calling unregister_netdev() is probably the right one as, like you said, you want to remove the device from being visible to the user before tearing down anything else. If I understood the problem correctly, the use-after-free caused by this line in wilc_netdev_ifc_init(): ndev->needs_free_netdev = true; This causes unregister_netdev() to implicitly call free_netdev(). Without that code, I think you could call unregister_netdev() early on (as it is right now) and when done with using the vifs, call free_netdev() while avoiding any dangling references. In any case, this is definitely not my area of expertise. I don't fully understand the motivition behind needs_free_netdev, even after reading https://docs.kernel.org/networking/netdevices.html - I suspect the use of that flag has evolved over the years and the docs may not be entirely up-to-date anymore. One driver I looked at (wireless/ath/wil6210/netdev.c) sets needs_free_netdev only for secondary vifs (i.e., all but the first vif). Hopefully someone else on this list can figure out what the right solution is here. Thanks, --david
diff --git a/drivers/net/wireless/microchip/wilc1000/spi.c b/drivers/net/wireless/microchip/wilc1000/spi.c index 77b4cdff73c3..dd6935dc1bc9 100644 --- a/drivers/net/wireless/microchip/wilc1000/spi.c +++ b/drivers/net/wireless/microchip/wilc1000/spi.c @@ -42,7 +42,7 @@ MODULE_PARM_DESC(enable_crc16, #define WILC_SPI_RSP_HDR_EXTRA_DATA 8 struct wilc_spi { - bool isinit; /* true if SPI protocol has been configured */ + bool isinit; /* true if wilc_spi_init was successful */ bool probing_crc; /* true if we're probing chip's CRC config */ bool crc7_enabled; /* true if crc7 is currently enabled */ bool crc16_enabled; /* true if crc16 is currently enabled */ @@ -55,6 +55,8 @@ struct wilc_spi { static const struct wilc_hif_func wilc_hif_spi; static int wilc_spi_reset(struct wilc *wilc); +static int wilc_spi_configure_bus_protocol(struct wilc *wilc); +static int wilc_validate_chipid(struct wilc *wilc); /******************************************** * @@ -232,6 +234,22 @@ static int wilc_bus_probe(struct spi_device *spi) } clk_prepare_enable(wilc->rtc_clk); + /* we need power to configure the bus protocol and to read the chip id: */ + + wilc_wlan_power(wilc, true); + + ret = wilc_spi_configure_bus_protocol(wilc); + + if (ret == 0) + ret = wilc_validate_chipid(wilc); + + wilc_wlan_power(wilc, false); + + if (ret) { + ret = -ENODEV; + goto netdev_cleanup; + } + return 0; netdev_cleanup: @@ -1074,58 +1092,43 @@ static int wilc_spi_write(struct wilc *wilc, u32 addr, u8 *buf, u32 size) * ********************************************/ -static int wilc_spi_reset(struct wilc *wilc) +static const char * +strbool(bool val) { - struct spi_device *spi = to_spi_device(wilc->dev); - struct wilc_spi *spi_priv = wilc->bus_data; - int result; - - result = wilc_spi_special_cmd(wilc, CMD_RESET); - if (result && !spi_priv->probing_crc) - dev_err(&spi->dev, "Failed cmd reset\n"); - - return result; -} - -static bool wilc_spi_is_init(struct wilc *wilc) -{ - struct wilc_spi *spi_priv = wilc->bus_data; - - return spi_priv->isinit; + return val ? "on" : "off"; } -static int wilc_spi_deinit(struct wilc *wilc) +static int wilc_validate_chipid(struct wilc *wilc) { - struct wilc_spi *spi_priv = wilc->bus_data; + struct spi_device *spi = to_spi_device(wilc->dev); + u32 chipid, base_id; + int ret; - spi_priv->isinit = false; - wilc_wlan_power(wilc, false); + /* + * make sure can read chip id without protocol error + */ + ret = wilc_spi_read_reg(wilc, WILC_CHIPID, &chipid); + if (ret) { + dev_err(&spi->dev, "Fail cmd read chip id...\n"); + return ret; + } + base_id = chipid & ~WILC_CHIP_REV_FIELD; + if (base_id != WILC_1000_BASE_ID && base_id != WILC_3000_BASE_ID) { + dev_err(&spi->dev, "Unknown chip id 0x%x\n", chipid); + return ret; + } + dev_info(&spi->dev, "Detected chip id 0x%x (crc7=%s, crc16=%s)\n", + chipid, strbool(enable_crc7), strbool(enable_crc16)); return 0; } -static int wilc_spi_init(struct wilc *wilc, bool resume) +static int wilc_spi_configure_bus_protocol(struct wilc *wilc) { struct spi_device *spi = to_spi_device(wilc->dev); struct wilc_spi *spi_priv = wilc->bus_data; u32 reg; - u32 chipid; int ret, i; - if (spi_priv->isinit) { - /* Confirm we can read chipid register without error: */ - ret = wilc_spi_read_reg(wilc, WILC_CHIPID, &chipid); - if (ret == 0) - return 0; - - dev_err(&spi->dev, "Fail cmd read chip id...\n"); - } - - wilc_wlan_power(wilc, true); - - /* - * configure protocol - */ - /* * Infer the CRC settings that are currently in effect. This * is necessary because we can't be sure that the chip has @@ -1176,12 +1179,54 @@ static int wilc_spi_init(struct wilc *wilc, bool resume) spi_priv->probing_crc = false; - /* - * make sure can read chip id without protocol error - */ - ret = wilc_spi_read_reg(wilc, WILC_CHIPID, &chipid); + return 0; +} + +static int wilc_spi_reset(struct wilc *wilc) +{ + struct spi_device *spi = to_spi_device(wilc->dev); + struct wilc_spi *spi_priv = wilc->bus_data; + int result; + + result = wilc_spi_special_cmd(wilc, CMD_RESET); + if (result && !spi_priv->probing_crc) + dev_err(&spi->dev, "Failed cmd reset\n"); + + return result; +} + +static bool wilc_spi_is_init(struct wilc *wilc) +{ + struct wilc_spi *spi_priv = wilc->bus_data; + + return spi_priv->isinit; +} + +static int wilc_spi_deinit(struct wilc *wilc) +{ + struct wilc_spi *spi_priv = wilc->bus_data; + + spi_priv->isinit = false; + wilc_wlan_power(wilc, false); + return 0; +} + +static int wilc_spi_init(struct wilc *wilc, bool resume) +{ + struct wilc_spi *spi_priv = wilc->bus_data; + int ret; + + if (spi_priv->isinit) { + /* Confirm we can read chipid register without error: */ + if (wilc_validate_chipid(wilc) == 0) + return 0; + } + + wilc_wlan_power(wilc, true); + + ret = wilc_spi_configure_bus_protocol(wilc); if (ret) { - dev_err(&spi->dev, "Fail cmd read chip id...\n"); + wilc_wlan_power(wilc, false); return ret; } diff --git a/drivers/net/wireless/microchip/wilc1000/wlan.h b/drivers/net/wireless/microchip/wilc1000/wlan.h index a72cd5cac81d..43dda9f0d9ca 100644 --- a/drivers/net/wireless/microchip/wilc1000/wlan.h +++ b/drivers/net/wireless/microchip/wilc1000/wlan.h @@ -182,6 +182,7 @@ #define WILC_CORTUS_BOOT_FROM_IRAM 0x71 #define WILC_1000_BASE_ID 0x100000 +#define WILC_3000_BASE_ID 0x300000 #define WILC_1000_BASE_ID_2A 0x1002A0 #define WILC_1000_BASE_ID_2A_REV1 (WILC_1000_BASE_ID_2A + 1)
Previously, the driver created a net device (typically wlan0) as soon as the module was loaded. This commit changes the driver to follow normal Linux convention of creating the net device only when bus probing detects a supported chip. Signed-off-by: David Mosberger-Tang <davidm@egauge.net> --- V2 -> V3: Add missing forward declarations, actually :-( drivers/net/wireless/microchip/wilc1000/spi.c | 133 ++++++++++++------ .../net/wireless/microchip/wilc1000/wlan.h | 1 + 2 files changed, 90 insertions(+), 44 deletions(-)