Message ID | 1627038402-114183-8-git-send-email-liudongdong3@huawei.com |
---|---|
State | New |
Headers | show |
Series | PCI: Enable 10-Bit tag support for PCIe devices | expand |
On 2021/7/24 0:58, kernel test robot wrote: > Hi Dongdong, > > Thank you for the patch! Perhaps something to improve: > > [auto build test WARNING on pci/next] > [also build test WARNING on linuxtv-media/master linus/master v5.14-rc2 next-20210723] > [If your patch is applied to the wrong git tree, kindly drop us a note. > And when submitting patch, we suggest to use '--base' as documented in > https://git-scm.com/docs/git-format-patch] > > url: https://github.com/0day-ci/linux/commits/Dongdong-Liu/PCI-Enable-10-Bit-tag-support-for-PCIe-devices/20210723-190930 > base: https://git.kernel.org/pub/scm/linux/kernel/git/helgaas/pci.git next > config: x86_64-randconfig-b001-20210723 (attached as .config) > compiler: clang version 13.0.0 (https://github.com/llvm/llvm-project 9625ca5b602616b2f5584e8a49ba93c52c141e40) > reproduce (this is a W=1 build): > wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross > chmod +x ~/bin/make.cross > # install x86_64 cross compiling tool for clang build > # apt-get install binutils-x86-64-linux-gnu > # https://github.com/0day-ci/linux/commit/2ff0b803971a3df5815c96c5c4874f4eef64fa2f > git remote add linux-review https://github.com/0day-ci/linux > git fetch --no-tags linux-review Dongdong-Liu/PCI-Enable-10-Bit-tag-support-for-PCIe-devices/20210723-190930 > git checkout 2ff0b803971a3df5815c96c5c4874f4eef64fa2f > # save the attached .config to linux build tree > mkdir build_dir > COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross O=build_dir ARCH=x86_64 SHELL=/bin/bash drivers/pci/ > > If you fix the issue, kindly add following tag as appropriate > Reported-by: kernel test robot <lkp@intel.com> > > All warnings (new ones prefixed by >>): > > drivers/pci/pci.c:6618:34: error: expected identifier > pcie_capability_clear_word(dev, PCI_EXP_DEVCTL2, > ^ > include/uapi/linux/pci_regs.h:657:26: note: expanded from macro 'PCI_EXP_DEVCTL2' > #define PCI_EXP_DEVCTL2 40 /* Device Control 2 */ > ^ >>> 
drivers/pci/pci.c:6618:2: warning: declaration specifier missing, defaulting to 'int' > pcie_capability_clear_word(dev, PCI_EXP_DEVCTL2, > ^ > int > drivers/pci/pci.c:6618:28: error: this function declaration is not a prototype [-Werror,-Wstrict-prototypes] > pcie_capability_clear_word(dev, PCI_EXP_DEVCTL2, > ^ > drivers/pci/pci.c:6618:2: error: conflicting types for 'pcie_capability_clear_word' > pcie_capability_clear_word(dev, PCI_EXP_DEVCTL2, > ^ > include/linux/pci.h:1161:19: note: previous definition is here > static inline int pcie_capability_clear_word(struct pci_dev *dev, int pos, > ^ > drivers/pci/pci.c:6621:2: error: expected parameter declarator > pci_info(dev, "disabled 10-Bit Tag Requester\n"); > ^ > include/linux/pci.h:2472:46: note: expanded from macro 'pci_info' > #define pci_info(pdev, fmt, arg...) dev_info(&(pdev)->dev, fmt, ##arg) > ^ > drivers/pci/pci.c:6621:2: error: expected ')' > include/linux/pci.h:2472:46: note: expanded from macro 'pci_info' > #define pci_info(pdev, fmt, arg...) dev_info(&(pdev)->dev, fmt, ##arg) > ^ > drivers/pci/pci.c:6621:2: note: to match this '(' > include/linux/pci.h:2472:37: note: expanded from macro 'pci_info' > #define pci_info(pdev, fmt, arg...) dev_info(&(pdev)->dev, fmt, ##arg) > ^ > include/linux/dev_printk.h:118:11: note: expanded from macro 'dev_info' > _dev_info(dev, dev_fmt(fmt), ##__VA_ARGS__) > ^ > drivers/pci/pci.c:6621:2: warning: declaration specifier missing, defaulting to 'int' > pci_info(dev, "disabled 10-Bit Tag Requester\n"); > ^ > int > include/linux/pci.h:2472:37: note: expanded from macro 'pci_info' > #define pci_info(pdev, fmt, arg...) 
dev_info(&(pdev)->dev, fmt, ##arg) > ^ > include/linux/dev_printk.h:118:2: note: expanded from macro 'dev_info' > _dev_info(dev, dev_fmt(fmt), ##__VA_ARGS__) > ^ > drivers/pci/pci.c:6621:2: error: this function declaration is not a prototype [-Werror,-Wstrict-prototypes] > include/linux/pci.h:2472:37: note: expanded from macro 'pci_info' > #define pci_info(pdev, fmt, arg...) dev_info(&(pdev)->dev, fmt, ##arg) > ^ > include/linux/dev_printk.h:118:11: note: expanded from macro 'dev_info' > _dev_info(dev, dev_fmt(fmt), ##__VA_ARGS__) > ^ > drivers/pci/pci.c:6621:2: error: conflicting types for '_dev_info' > include/linux/pci.h:2472:37: note: expanded from macro 'pci_info' > #define pci_info(pdev, fmt, arg...) dev_info(&(pdev)->dev, fmt, ##arg) > ^ > include/linux/dev_printk.h:118:2: note: expanded from macro 'dev_info' > _dev_info(dev, dev_fmt(fmt), ##__VA_ARGS__) > ^ > include/linux/dev_printk.h:56:6: note: previous declaration is here > void _dev_info(const struct device *dev, const char *fmt, ...); > ^ > drivers/pci/pci.c:6622:1: error: extraneous closing brace ('}') > } > ^ > 2 warnings and 8 errors generated. 
> > > vim +/int +6618 drivers/pci/pci.c > > 6580 > 6581 if (!disable_10bit_tag_param) > 6582 return; > 6583 > 6584 p = disable_10bit_tag_param; > 6585 while (*p) { > 6586 ret = pci_dev_str_match(dev, p, &p); > 6587 if (ret < 0) { > 6588 pr_info_once("PCI: Can't parse disable_10bit_tag parameter: %s\n", > 6589 disable_10bit_tag_param); > 6590 > 6591 break; > 6592 } else if (ret == 1) { > 6593 /* Found a match */ > 6594 break; > 6595 } > 6596 > 6597 if (*p != ';' && *p != ',') { > 6598 /* End of param or invalid format */ > 6599 break; > 6600 } > 6601 p++; > 6602 } > 6603 > 6604 if (ret != 1) > 6605 return; > 6606 > 6607 #ifdef CONFIG_PCI_IOV > 6608 if (dev->is_virtfn) { > 6609 iov = dev->physfn->sriov; > 6610 iov->ctrl &= ~PCI_SRIOV_CTRL_VF_10BIT_TAG_REQ_EN; > 6611 pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, > 6612 iov->ctrl); > 6613 pci_info(dev, "disabled PF SRIOV 10-Bit Tag Requester\n"); > 6614 return; > 6615 #endif > 6616 } I made a mistake here, will fix. Thanks, Dongdong > 6617 >> 6618 pcie_capability_clear_word(dev, PCI_EXP_DEVCTL2, > 6619 PCI_EXP_DEVCTL2_10BIT_TAG_REQ_EN); > 6620 > 6621 pci_info(dev, "disabled 10-Bit Tag Requester\n"); > 6622 } > 6623 > > --- > 0-DAY CI Kernel Test Service, Intel Corporation > https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org >
On Fri, Jul 23, 2021 at 10:20:50AM -0600, Logan Gunthorpe wrote: > > > > On 2021-07-23 5:32 a.m., Leon Romanovsky wrote: > > On Fri, Jul 23, 2021 at 07:06:41PM +0800, Dongdong Liu wrote: > >> PCIe spec 5.0 r1.0 section 2.2.6.2 says that if an Endpoint supports > >> sending Requests to other Endpoints (as opposed to host memory), the > >> Endpoint must not send 10-Bit Tag Requests to another given Endpoint > >> unless an implementation-specific mechanism determines that the Endpoint > >> supports 10-Bit Tag Completer capability. Add "pci=disable_10bit_tag=" > >> parameter to disable 10-Bit Tag Requester if the peer device does not > >> support the 10-Bit Tag Completer. This will make P2P traffic safe. > >> > >> Signed-off-by: Dongdong Liu <liudongdong3@huawei.com> > >> --- > >> Documentation/admin-guide/kernel-parameters.txt | 7 ++++ > >> drivers/pci/pci.c | 56 +++++++++++++++++++++++++ > >> drivers/pci/pci.h | 1 + > >> drivers/pci/pcie/portdrv_pci.c | 13 +++--- > >> drivers/pci/probe.c | 9 ++-- > >> 5 files changed, 78 insertions(+), 8 deletions(-) > >> > >> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt > >> index bdb2200..c2c4585 100644 > >> --- a/Documentation/admin-guide/kernel-parameters.txt > >> +++ b/Documentation/admin-guide/kernel-parameters.txt > >> @@ -4019,6 +4019,13 @@ > >> bridges without forcing it upstream. Note: > >> this removes isolation between devices and > >> may put more devices in an IOMMU group. > >> + disable_10bit_tag=<pci_dev>[; ...] > >> + Specify one or more PCI devices (in the format > >> + specified above) separated by semicolons. > >> + Disable 10-Bit Tag Requester if the peer > >> + device does not support the 10-Bit Tag > >> + Completer.This will make P2P traffic safe. > > > > I can't imagine more awkward user experience than such kernel parameter. 
> > > > As a user, I will need to boot the system, hope for the best that system > > works, write down all PCI device numbers, guess which one doesn't work > > properly, update grub with new command line argument and reboot the > > system. Any HW change and this dance should be repeated. > > There are already two such PCI parameters with this pattern and they are > not that awkward. pci_dev may be specified with either vendor/device IDS > or with a path of BDFs (which protects against renumbering). Unfortunately, in the real world, BDF is not so stable. It changes with addition of new hardware, BIOS upgrades and even broken servers. Vendor/device IDs doesn't work if you have multiple devices of same vendor in the system. > > This flag is only useful in P2PDMA traffic, and if the user attempts > such a transfer, it prints a warning (see the next patch) with the exact > parameter that needs to be added to the command line. Dongdong citied PCI spec and it was very clear - don't enable this feature unless you clearly know that it is safe to enable. This is completely opposite to the proposal here - always enable and disable if something is printed to the dmesg. > > This has worked well for disable_acs_redir and was used for > resource_alignment before that for quite some time. So save a better > suggestion I think this is more than acceptable. I don't know about other parameters and their history, but we are not in 90s anymore and addition of modules parameters (for the PCI it is kernel cmdline arguments) are better to be changed to some configuration tool/sysfs. Even FW upgrade with such kernel parameter can be problematic. Thanks > > Logan
On 2021-07-25 12:39 a.m., Leon Romanovsky wrote: > On Fri, Jul 23, 2021 at 10:20:50AM -0600, Logan Gunthorpe wrote: >> >> >> >> On 2021-07-23 5:32 a.m., Leon Romanovsky wrote: >>> On Fri, Jul 23, 2021 at 07:06:41PM +0800, Dongdong Liu wrote: >>>> PCIe spec 5.0 r1.0 section 2.2.6.2 says that if an Endpoint supports >>>> sending Requests to other Endpoints (as opposed to host memory), the >>>> Endpoint must not send 10-Bit Tag Requests to another given Endpoint >>>> unless an implementation-specific mechanism determines that the Endpoint >>>> supports 10-Bit Tag Completer capability. Add "pci=disable_10bit_tag=" >>>> parameter to disable 10-Bit Tag Requester if the peer device does not >>>> support the 10-Bit Tag Completer. This will make P2P traffic safe. >>>> >>>> Signed-off-by: Dongdong Liu <liudongdong3@huawei.com> >>>> --- >>>> Documentation/admin-guide/kernel-parameters.txt | 7 ++++ >>>> drivers/pci/pci.c | 56 +++++++++++++++++++++++++ >>>> drivers/pci/pci.h | 1 + >>>> drivers/pci/pcie/portdrv_pci.c | 13 +++--- >>>> drivers/pci/probe.c | 9 ++-- >>>> 5 files changed, 78 insertions(+), 8 deletions(-) >>>> >>>> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt >>>> index bdb2200..c2c4585 100644 >>>> --- a/Documentation/admin-guide/kernel-parameters.txt >>>> +++ b/Documentation/admin-guide/kernel-parameters.txt >>>> @@ -4019,6 +4019,13 @@ >>>> bridges without forcing it upstream. Note: >>>> this removes isolation between devices and >>>> may put more devices in an IOMMU group. >>>> + disable_10bit_tag=<pci_dev>[; ...] >>>> + Specify one or more PCI devices (in the format >>>> + specified above) separated by semicolons. >>>> + Disable 10-Bit Tag Requester if the peer >>>> + device does not support the 10-Bit Tag >>>> + Completer.This will make P2P traffic safe. >>> >>> I can't imagine more awkward user experience than such kernel parameter. 
>>> >>> As a user, I will need to boot the system, hope for the best that system >>> works, write down all PCI device numbers, guess which one doesn't work >>> properly, update grub with new command line argument and reboot the >>> system. Any HW change and this dance should be repeated. >> >> There are already two such PCI parameters with this pattern and they are >> not that awkward. pci_dev may be specified with either vendor/device IDS >> or with a path of BDFs (which protects against renumbering). > > Unfortunately, in the real world, BDF is not so stable. It changes with > addition of new hardware, BIOS upgrades and even broken servers. That's why it supports using a *path* of BDFs which tends not to catch the wrong device if the topology changes. > Vendor/device IDs doesn't work if you have multiple devices of same > vendor in the system. Yes, but it's fine for some use cases. That's why there's a range of options. >> >> This flag is only useful in P2PDMA traffic, and if the user attempts >> such a transfer, it prints a warning (see the next patch) with the exact >> parameter that needs to be added to the command line. > > Dongdong citied PCI spec and it was very clear - don't enable this > feature unless you clearly know that it is safe to enable. This is > completely opposite to the proposal here - always enable and disable > if something is printed to the dmesg. Quoting from patch 4: "For platforms where the RC supports 10-Bit Tag Completer capability, it is highly recommended for platform firmware or operating software that configures PCIe hierarchies to Set the 10-Bit Tag Requester Enable bit automatically in Endpoints with 10-Bit Tag Requester capability. This enables the important class of 10-Bit Tag capable adapters that send Memory Read Requests only to host memory." Notice the last sentence. It's saying that devices who only talk to host memory should have 10-bit tags enabled. 
In the kernel we call devices that talk to things besides host memory "P2PDMA". So the spec is saying not to enable 10bit tags for devices participating in P2PDMA. The kernel needs a way to allow users to do that. The kernel parameter only stops the feature from being enabled for a specific device, and the only use-case is P2PDMA which is not that common and requires the user to be aware of their topology. So I really don't think this is that big a problem. >> >> This has worked well for disable_acs_redir and was used for >> resource_alignment before that for quite some time. So save a better >> suggestion I think this is more than acceptable. > > I don't know about other parameters and their history, but we are not in > 90s anymore and addition of modules parameters (for the PCI it is kernel > cmdline arguments) are better to be changed to some configuration tool/sysfs. The problem was that the ACS bits had to be set before the kernel enumerated the devices. The IOMMU code simply was not able to support dynamic adjustments to its groups. I assume changing 10bit tags dynamically is similarly tricky -- but if it's not then, yes a sysfs interface in addition to the kernel parameter would be a good idea. Logan
On Mon, Jul 26, 2021 at 09:48:57AM -0600, Logan Gunthorpe wrote: > > > On 2021-07-25 12:39 a.m., Leon Romanovsky wrote: > > On Fri, Jul 23, 2021 at 10:20:50AM -0600, Logan Gunthorpe wrote: > >> > >> > >> > >> On 2021-07-23 5:32 a.m., Leon Romanovsky wrote: > >>> On Fri, Jul 23, 2021 at 07:06:41PM +0800, Dongdong Liu wrote: > >>>> PCIe spec 5.0 r1.0 section 2.2.6.2 says that if an Endpoint supports > >>>> sending Requests to other Endpoints (as opposed to host memory), the > >>>> Endpoint must not send 10-Bit Tag Requests to another given Endpoint > >>>> unless an implementation-specific mechanism determines that the Endpoint > >>>> supports 10-Bit Tag Completer capability. Add "pci=disable_10bit_tag=" > >>>> parameter to disable 10-Bit Tag Requester if the peer device does not > >>>> support the 10-Bit Tag Completer. This will make P2P traffic safe. > >>>> > >>>> Signed-off-by: Dongdong Liu <liudongdong3@huawei.com> > >>>> --- > >>>> Documentation/admin-guide/kernel-parameters.txt | 7 ++++ > >>>> drivers/pci/pci.c | 56 +++++++++++++++++++++++++ > >>>> drivers/pci/pci.h | 1 + > >>>> drivers/pci/pcie/portdrv_pci.c | 13 +++--- > >>>> drivers/pci/probe.c | 9 ++-- > >>>> 5 files changed, 78 insertions(+), 8 deletions(-) > >>>> > >>>> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt > >>>> index bdb2200..c2c4585 100644 > >>>> --- a/Documentation/admin-guide/kernel-parameters.txt > >>>> +++ b/Documentation/admin-guide/kernel-parameters.txt > >>>> @@ -4019,6 +4019,13 @@ > >>>> bridges without forcing it upstream. Note: > >>>> this removes isolation between devices and > >>>> may put more devices in an IOMMU group. > >>>> + disable_10bit_tag=<pci_dev>[; ...] > >>>> + Specify one or more PCI devices (in the format > >>>> + specified above) separated by semicolons. > >>>> + Disable 10-Bit Tag Requester if the peer > >>>> + device does not support the 10-Bit Tag > >>>> + Completer.This will make P2P traffic safe. 
> >>> > >>> I can't imagine more awkward user experience than such kernel parameter. > >>> > >>> As a user, I will need to boot the system, hope for the best that system > >>> works, write down all PCI device numbers, guess which one doesn't work > >>> properly, update grub with new command line argument and reboot the > >>> system. Any HW change and this dance should be repeated. > >> > >> There are already two such PCI parameters with this pattern and they are > >> not that awkward. pci_dev may be specified with either vendor/device IDS > >> or with a path of BDFs (which protects against renumbering). > > > > Unfortunately, in the real world, BDF is not so stable. It changes with > > addition of new hardware, BIOS upgrades and even broken servers. > > That's why it supports using a *path* of BDFs which tends not to catch > the wrong device if the topology changes. > > > Vendor/device IDs doesn't work if you have multiple devices of same > > vendor in the system. > > Yes, but it's fine for some use cases. That's why there's a range of > options. The thing is that you are adding PCI parameter that is applicable to everyone. We probably see different usage models for this feature. In my world, users have thousands of servers that runs 24x7, with VMs on top, some of them perform FW upgrades without stopping anything. The idea that you can reboot such server any time, simply doesn't exist. So if I need to enable/disable this feature for one of the VFs, I will be stuck. > > >> > >> This flag is only useful in P2PDMA traffic, and if the user attempts > >> such a transfer, it prints a warning (see the next patch) with the exact > >> parameter that needs to be added to the command line. > > > > Dongdong citied PCI spec and it was very clear - don't enable this > > feature unless you clearly know that it is safe to enable. This is > > completely opposite to the proposal here - always enable and disable > > if something is printed to the dmesg. 
> > Quoting from patch 4: > > "For platforms where the RC supports 10-Bit Tag Completer capability, > it is highly recommended for platform firmware or operating software > that configures PCIe hierarchies to Set the 10-Bit Tag Requester Enable > bit automatically in Endpoints with 10-Bit Tag Requester capability. > This enables the important class of 10-Bit Tag capable adapters that > send Memory Read Requests only to host memory." > > Notice the last sentence. It's saying that devices who only talk to host > memory should have 10-bit tags enabled. In the kernel we call devices > that talk to things besides host memory "P2PDMA". So the spec is saying > not to enable 10bit tags for devices participating in P2PDMA. The kernel > needs a way to allow users to do that. The kernel parameter only stops > the feature from being enabled for a specific device, and the only > use-case is P2PDMA which is not that common and requires the user to be > aware of their topology. So I really don't think this is that big a problem. I'm not question the feature and the need of configuration. My concern is just *how* this feature is configured. > > >> > >> This has worked well for disable_acs_redir and was used for > >> resource_alignment before that for quite some time. So save a better > >> suggestion I think this is more than acceptable. > > > > I don't know about other parameters and their history, but we are not in > > 90s anymore and addition of modules parameters (for the PCI it is kernel > > cmdline arguments) are better to be changed to some configuration tool/sysfs. > > The problem was that the ACS bits had to be set before the kernel > enumerated the devices. The IOMMU code simply was not able to support > dynamic adjustments to its groups. I assume changing 10bit tags > dynamically is similarly tricky -- but if it's not then, yes a sysfs > interface in addition to the kernel parameter would be a good idea. 
I think that it is doable with a combination of disabling drivers_autoprobe and some sysfs knob to enable/disable this feature before driver bind. It should be very similar to what we did for the dynamic MSI-X, see /sys/bus/pci/devices/.../sriov_vf_msix_count Thanks > > Logan
On 2021/7/26 23:48, Logan Gunthorpe wrote: > > > On 2021-07-25 12:39 a.m., Leon Romanovsky wrote: >> On Fri, Jul 23, 2021 at 10:20:50AM -0600, Logan Gunthorpe wrote: >>> >>> >>> >>> On 2021-07-23 5:32 a.m., Leon Romanovsky wrote: >>>> On Fri, Jul 23, 2021 at 07:06:41PM +0800, Dongdong Liu wrote: >>>>> PCIe spec 5.0 r1.0 section 2.2.6.2 says that if an Endpoint supports >>>>> sending Requests to other Endpoints (as opposed to host memory), the >>>>> Endpoint must not send 10-Bit Tag Requests to another given Endpoint >>>>> unless an implementation-specific mechanism determines that the Endpoint >>>>> supports 10-Bit Tag Completer capability. Add "pci=disable_10bit_tag=" >>>>> parameter to disable 10-Bit Tag Requester if the peer device does not >>>>> support the 10-Bit Tag Completer. This will make P2P traffic safe. >>>>> >>>>> Signed-off-by: Dongdong Liu <liudongdong3@huawei.com> >>>>> --- >>>>> Documentation/admin-guide/kernel-parameters.txt | 7 ++++ >>>>> drivers/pci/pci.c | 56 +++++++++++++++++++++++++ >>>>> drivers/pci/pci.h | 1 + >>>>> drivers/pci/pcie/portdrv_pci.c | 13 +++--- >>>>> drivers/pci/probe.c | 9 ++-- >>>>> 5 files changed, 78 insertions(+), 8 deletions(-) >>>>> >>>>> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt >>>>> index bdb2200..c2c4585 100644 >>>>> --- a/Documentation/admin-guide/kernel-parameters.txt >>>>> +++ b/Documentation/admin-guide/kernel-parameters.txt >>>>> @@ -4019,6 +4019,13 @@ >>>>> bridges without forcing it upstream. Note: >>>>> this removes isolation between devices and >>>>> may put more devices in an IOMMU group. >>>>> + disable_10bit_tag=<pci_dev>[; ...] >>>>> + Specify one or more PCI devices (in the format >>>>> + specified above) separated by semicolons. >>>>> + Disable 10-Bit Tag Requester if the peer >>>>> + device does not support the 10-Bit Tag >>>>> + Completer.This will make P2P traffic safe. 
>>>> >>>> I can't imagine more awkward user experience than such kernel parameter. >>>> >>>> As a user, I will need to boot the system, hope for the best that system >>>> works, write down all PCI device numbers, guess which one doesn't work >>>> properly, update grub with new command line argument and reboot the >>>> system. Any HW change and this dance should be repeated. >>> >>> There are already two such PCI parameters with this pattern and they are >>> not that awkward. pci_dev may be specified with either vendor/device IDS >>> or with a path of BDFs (which protects against renumbering). >> >> Unfortunately, in the real world, BDF is not so stable. It changes with >> addition of new hardware, BIOS upgrades and even broken servers. > > That's why it supports using a *path* of BDFs which tends not to catch > the wrong device if the topology changes. > >> Vendor/device IDs doesn't work if you have multiple devices of same >> vendor in the system. > > Yes, but it's fine for some use cases. That's why there's a range of > options. > >>> >>> This flag is only useful in P2PDMA traffic, and if the user attempts >>> such a transfer, it prints a warning (see the next patch) with the exact >>> parameter that needs to be added to the command line. >> >> Dongdong citied PCI spec and it was very clear - don't enable this >> feature unless you clearly know that it is safe to enable. This is >> completely opposite to the proposal here - always enable and disable >> if something is printed to the dmesg. > > Quoting from patch 4: > > "For platforms where the RC supports 10-Bit Tag Completer capability, > it is highly recommended for platform firmware or operating software > that configures PCIe hierarchies to Set the 10-Bit Tag Requester Enable > bit automatically in Endpoints with 10-Bit Tag Requester capability. > This enables the important class of 10-Bit Tag capable adapters that > send Memory Read Requests only to host memory." > > Notice the last sentence. 
It's saying that devices who only talk to host > memory should have 10-bit tags enabled. In the kernel we call devices > that talk to things besides host memory "P2PDMA". So the spec is saying > not to enable 10bit tags for devices participating in P2PDMA. The kernel > needs a way to allow users to do that. The kernel parameter only stops > the feature from being enabled for a specific device, and the only > use-case is P2PDMA which is not that common and requires the user to be > aware of their topology. So I really don't think this is that big a problem. > >>> >>> This has worked well for disable_acs_redir and was used for >>> resource_alignment before that for quite some time. So save a better >>> suggestion I think this is more than acceptable. >> >> I don't know about other parameters and their history, but we are not in >> 90s anymore and addition of modules parameters (for the PCI it is kernel >> cmdline arguments) are better to be changed to some configuration tool/sysfs. > > The problem was that the ACS bits had to be set before the kernel > enumerated the devices. The IOMMU code simply was not able to support > dynamic adjustments to its groups. I assume changing 10bit tags > dynamically is similarly tricky -- but if it's not then, yes a sysfs > interface in addition to the kernel parameter would be a good idea. PCIe spec 5.0 section 7.5.3.16 (Device Control 2 Register, 10-Bit Tag Requester Enable) says that if software changes the value of this bit while the Function has outstanding Non-Posted Requests, the result is undefined. So 10-Bit Tag Requester Enable should be set before probing the device driver. Thanks, Dongdong > > Logan > . >
On 2021/7/27 19:05, Leon Romanovsky wrote: > On Mon, Jul 26, 2021 at 09:48:57AM -0600, Logan Gunthorpe wrote: >> >> >> On 2021-07-25 12:39 a.m., Leon Romanovsky wrote: >>> On Fri, Jul 23, 2021 at 10:20:50AM -0600, Logan Gunthorpe wrote: >>>> >>>> >>>> >>>> On 2021-07-23 5:32 a.m., Leon Romanovsky wrote: >>>>> On Fri, Jul 23, 2021 at 07:06:41PM +0800, Dongdong Liu wrote: >>>>>> PCIe spec 5.0 r1.0 section 2.2.6.2 says that if an Endpoint supports >>>>>> sending Requests to other Endpoints (as opposed to host memory), the >>>>>> Endpoint must not send 10-Bit Tag Requests to another given Endpoint >>>>>> unless an implementation-specific mechanism determines that the Endpoint >>>>>> supports 10-Bit Tag Completer capability. Add "pci=disable_10bit_tag=" >>>>>> parameter to disable 10-Bit Tag Requester if the peer device does not >>>>>> support the 10-Bit Tag Completer. This will make P2P traffic safe. >>>>>> >>>>>> Signed-off-by: Dongdong Liu <liudongdong3@huawei.com> >>>>>> --- >>>>>> Documentation/admin-guide/kernel-parameters.txt | 7 ++++ >>>>>> drivers/pci/pci.c | 56 +++++++++++++++++++++++++ >>>>>> drivers/pci/pci.h | 1 + >>>>>> drivers/pci/pcie/portdrv_pci.c | 13 +++--- >>>>>> drivers/pci/probe.c | 9 ++-- >>>>>> 5 files changed, 78 insertions(+), 8 deletions(-) >>>>>> >>>>>> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt >>>>>> index bdb2200..c2c4585 100644 >>>>>> --- a/Documentation/admin-guide/kernel-parameters.txt >>>>>> +++ b/Documentation/admin-guide/kernel-parameters.txt >>>>>> @@ -4019,6 +4019,13 @@ >>>>>> bridges without forcing it upstream. Note: >>>>>> this removes isolation between devices and >>>>>> may put more devices in an IOMMU group. >>>>>> + disable_10bit_tag=<pci_dev>[; ...] >>>>>> + Specify one or more PCI devices (in the format >>>>>> + specified above) separated by semicolons. 
>>>>>> + Disable 10-Bit Tag Requester if the peer >>>>>> + device does not support the 10-Bit Tag >>>>>> + Completer.This will make P2P traffic safe. >>>>> >>>>> I can't imagine more awkward user experience than such kernel parameter. >>>>> >>>>> As a user, I will need to boot the system, hope for the best that system >>>>> works, write down all PCI device numbers, guess which one doesn't work >>>>> properly, update grub with new command line argument and reboot the >>>>> system. Any HW change and this dance should be repeated. >>>> >>>> There are already two such PCI parameters with this pattern and they are >>>> not that awkward. pci_dev may be specified with either vendor/device IDS >>>> or with a path of BDFs (which protects against renumbering). >>> >>> Unfortunately, in the real world, BDF is not so stable. It changes with >>> addition of new hardware, BIOS upgrades and even broken servers. >> >> That's why it supports using a *path* of BDFs which tends not to catch >> the wrong device if the topology changes. >> >>> Vendor/device IDs doesn't work if you have multiple devices of same >>> vendor in the system. >> >> Yes, but it's fine for some use cases. That's why there's a range of >> options. > > The thing is that you are adding PCI parameter that is applicable to everyone. > > We probably see different usage models for this feature. In my world, users > have thousands of servers that runs 24x7, with VMs on top, some of them perform > FW upgrades without stopping anything. The idea that you can reboot such server > any time, simply doesn't exist. > > So if I need to enable/disable this feature for one of the VFs, I will be stuck. > >> >>>> >>>> This flag is only useful in P2PDMA traffic, and if the user attempts >>>> such a transfer, it prints a warning (see the next patch) with the exact >>>> parameter that needs to be added to the command line. 
>>> >>> Dongdong citied PCI spec and it was very clear - don't enable this >>> feature unless you clearly know that it is safe to enable. This is >>> completely opposite to the proposal here - always enable and disable >>> if something is printed to the dmesg. >> >> Quoting from patch 4: >> >> "For platforms where the RC supports 10-Bit Tag Completer capability, >> it is highly recommended for platform firmware or operating software >> that configures PCIe hierarchies to Set the 10-Bit Tag Requester Enable >> bit automatically in Endpoints with 10-Bit Tag Requester capability. >> This enables the important class of 10-Bit Tag capable adapters that >> send Memory Read Requests only to host memory." >> >> Notice the last sentence. It's saying that devices who only talk to host >> memory should have 10-bit tags enabled. In the kernel we call devices >> that talk to things besides host memory "P2PDMA". So the spec is saying >> not to enable 10bit tags for devices participating in P2PDMA. The kernel >> needs a way to allow users to do that. The kernel parameter only stops >> the feature from being enabled for a specific device, and the only >> use-case is P2PDMA which is not that common and requires the user to be >> aware of their topology. So I really don't think this is that big a problem. > > I'm not question the feature and the need of configuration. My concern > is just *how* this feature is configured. > >> >>>> >>>> This has worked well for disable_acs_redir and was used for >>>> resource_alignment before that for quite some time. So save a better >>>> suggestion I think this is more than acceptable. >>> >>> I don't know about other parameters and their history, but we are not in >>> 90s anymore and addition of modules parameters (for the PCI it is kernel >>> cmdline arguments) are better to be changed to some configuration tool/sysfs. >> >> The problem was that the ACS bits had to be set before the kernel >> enumerated the devices. 
The IOMMU code simply was not able to support >> dynamic adjustments to its groups. I assume changing 10bit tags >> dynamically is similarly tricky -- but if it's not then, yes a sysfs >> interface in addition to the kernel parameter would be a good idea. > > I think that it is doable with combination of drivers_autoprobe disable > and some sysfs knob to enable/disable this feature before driver bind. > > It should be very similar to that we did for the dynamic MSI-X, see > /sys/bus/pci/devices/.../sriov_vf_msix_count Many thanks for your suggestion. It seems a sysfs knob could work OK, but we need to make sure the 10-Bit Tag Requester Enable bit is set before the device driver is bound, because PCIe spec 5.0 section 7.5.3.16 (Device Control 2 Register), in its description of 10-Bit Tag Requester Enable, says that if software changes the value of this bit while the Function has outstanding Non-Posted Requests, the result is undefined. Thanks, Dongdong > > Thanks > >> >> Logan > . >
On Tue, Jul 27, 2021 at 10:30:40PM +0800, Dongdong Liu wrote: > > > On 2021/7/27 19:05, Leon Romanovsky wrote: > > On Mon, Jul 26, 2021 at 09:48:57AM -0600, Logan Gunthorpe wrote: > > > > > > > > > On 2021-07-25 12:39 a.m., Leon Romanovsky wrote: > > > > On Fri, Jul 23, 2021 at 10:20:50AM -0600, Logan Gunthorpe wrote: > > > > > > > > > > > > > > > > > > > > On 2021-07-23 5:32 a.m., Leon Romanovsky wrote: > > > > > > On Fri, Jul 23, 2021 at 07:06:41PM +0800, Dongdong Liu wrote: > > > > > > > PCIe spec 5.0 r1.0 section 2.2.6.2 says that if an Endpoint supports > > > > > > > sending Requests to other Endpoints (as opposed to host memory), the > > > > > > > Endpoint must not send 10-Bit Tag Requests to another given Endpoint > > > > > > > unless an implementation-specific mechanism determines that the Endpoint > > > > > > > supports 10-Bit Tag Completer capability. Add "pci=disable_10bit_tag=" > > > > > > > parameter to disable 10-Bit Tag Requester if the peer device does not > > > > > > > support the 10-Bit Tag Completer. This will make P2P traffic safe. > > > > > > > > > > > > > > Signed-off-by: Dongdong Liu <liudongdong3@huawei.com> > > > > > > > --- > > > > > > > Documentation/admin-guide/kernel-parameters.txt | 7 ++++ > > > > > > > drivers/pci/pci.c | 56 +++++++++++++++++++++++++ > > > > > > > drivers/pci/pci.h | 1 + > > > > > > > drivers/pci/pcie/portdrv_pci.c | 13 +++--- > > > > > > > drivers/pci/probe.c | 9 ++-- > > > > > > > 5 files changed, 78 insertions(+), 8 deletions(-) > > > > > > > > > > > > > > diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt > > > > > > > index bdb2200..c2c4585 100644 > > > > > > > --- a/Documentation/admin-guide/kernel-parameters.txt > > > > > > > +++ b/Documentation/admin-guide/kernel-parameters.txt > > > > > > > @@ -4019,6 +4019,13 @@ > > > > > > > bridges without forcing it upstream. 
Note: > > > > > > > this removes isolation between devices and > > > > > > > may put more devices in an IOMMU group. > > > > > > > + disable_10bit_tag=<pci_dev>[; ...] > > > > > > > + Specify one or more PCI devices (in the format > > > > > > > + specified above) separated by semicolons. > > > > > > > + Disable 10-Bit Tag Requester if the peer > > > > > > > + device does not support the 10-Bit Tag > > > > > > > + Completer.This will make P2P traffic safe. > > > > > > > > > > > > I can't imagine more awkward user experience than such kernel parameter. > > > > > > > > > > > > As a user, I will need to boot the system, hope for the best that system > > > > > > works, write down all PCI device numbers, guess which one doesn't work > > > > > > properly, update grub with new command line argument and reboot the > > > > > > system. Any HW change and this dance should be repeated. > > > > > > > > > > There are already two such PCI parameters with this pattern and they are > > > > > not that awkward. pci_dev may be specified with either vendor/device IDS > > > > > or with a path of BDFs (which protects against renumbering). > > > > > > > > Unfortunately, in the real world, BDF is not so stable. It changes with > > > > addition of new hardware, BIOS upgrades and even broken servers. > > > > > > That's why it supports using a *path* of BDFs which tends not to catch > > > the wrong device if the topology changes. > > > > > > > Vendor/device IDs doesn't work if you have multiple devices of same > > > > vendor in the system. > > > > > > Yes, but it's fine for some use cases. That's why there's a range of > > > options. > > > > The thing is that you are adding PCI parameter that is applicable to everyone. > > > > We probably see different usage models for this feature. In my world, users > > have thousands of servers that runs 24x7, with VMs on top, some of them perform > > FW upgrades without stopping anything. 
The idea that you can reboot such server > > any time, simply doesn't exist. > > > > So if I need to enable/disable this feature for one of the VFs, I will be stuck. > > > > > > > > > > > > > > > This flag is only useful in P2PDMA traffic, and if the user attempts > > > > > such a transfer, it prints a warning (see the next patch) with the exact > > > > > parameter that needs to be added to the command line. > > > > > > > > Dongdong citied PCI spec and it was very clear - don't enable this > > > > feature unless you clearly know that it is safe to enable. This is > > > > completely opposite to the proposal here - always enable and disable > > > > if something is printed to the dmesg. > > > > > > Quoting from patch 4: > > > > > > "For platforms where the RC supports 10-Bit Tag Completer capability, > > > it is highly recommended for platform firmware or operating software > > > that configures PCIe hierarchies to Set the 10-Bit Tag Requester Enable > > > bit automatically in Endpoints with 10-Bit Tag Requester capability. > > > This enables the important class of 10-Bit Tag capable adapters that > > > send Memory Read Requests only to host memory." > > > > > > Notice the last sentence. It's saying that devices who only talk to host > > > memory should have 10-bit tags enabled. In the kernel we call devices > > > that talk to things besides host memory "P2PDMA". So the spec is saying > > > not to enable 10bit tags for devices participating in P2PDMA. The kernel > > > needs a way to allow users to do that. The kernel parameter only stops > > > the feature from being enabled for a specific device, and the only > > > use-case is P2PDMA which is not that common and requires the user to be > > > aware of their topology. So I really don't think this is that big a problem. > > > > I'm not question the feature and the need of configuration. My concern > > is just *how* this feature is configured. 
> > > > > > > > > > > > > > > This has worked well for disable_acs_redir and was used for > > > > > resource_alignment before that for quite some time. So save a better > > > > > suggestion I think this is more than acceptable. > > > > > > > > I don't know about other parameters and their history, but we are not in > > > > 90s anymore and addition of modules parameters (for the PCI it is kernel > > > > cmdline arguments) are better to be changed to some configuration tool/sysfs. > > > > > > The problem was that the ACS bits had to be set before the kernel > > > enumerated the devices. The IOMMU code simply was not able to support > > > dynamic adjustments to its groups. I assume changing 10bit tags > > > dynamically is similarly tricky -- but if it's not then, yes a sysfs > > > interface in addition to the kernel parameter would be a good idea. > > > > I think that it is doable with combination of drivers_autoprobe disable > > and some sysfs knob to enable/disable this feature before driver bind. > > > > It should be very similar to that we did for the dynamic MSI-X, see > > /sys/bus/pci/devices/.../sriov_vf_msix_count > > Many thanks for your suggestion. > > Seems a sysfs could be work ok, but need to make sure 10-Bit Tag Requester > to be set before binding the device driver as > PCIe spec 5.0 section 7.5.3.16 Device Control 2 Register > 10-Bit Tag Requester Enable says that > If software changes the value of this bit while the Function > has outstanding Non-Posted Requests, the result is undefined. This is where drivers_autoprobe will help. Thanks > > Thanks, > Dongdong > > > > Thanks > > > > > > > > Logan > > . > >
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index bdb2200..c2c4585 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4019,6 +4019,13 @@ bridges without forcing it upstream. Note: this removes isolation between devices and may put more devices in an IOMMU group. + disable_10bit_tag=<pci_dev>[; ...] + Specify one or more PCI devices (in the format + specified above) separated by semicolons. + Disable 10-Bit Tag Requester if the peer + device does not support the 10-Bit Tag + Completer.This will make P2P traffic safe. + force_floating [S390] Force usage of floating interrupts. nomio [S390] Do not use MIO instructions. norid [S390] ignore the RID field and force use of diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index d14c573..8494e4f 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -6568,6 +6568,59 @@ int pci_bus_find_domain_nr(struct pci_bus *bus, struct device *parent) } #endif +static const char *disable_10bit_tag_param; + +void pci_disable_10bit_tag(struct pci_dev *dev) +{ + int ret = 0; + const char *p; +#ifdef CONFIG_PCI_IOV + struct pci_sriov *iov; +#endif + + if (!disable_10bit_tag_param) + return; + + p = disable_10bit_tag_param; + while (*p) { + ret = pci_dev_str_match(dev, p, &p); + if (ret < 0) { + pr_info_once("PCI: Can't parse disable_10bit_tag parameter: %s\n", + disable_10bit_tag_param); + + break; + } else if (ret == 1) { + /* Found a match */ + break; + } + + if (*p != ';' && *p != ',') { + /* End of param or invalid format */ + break; + } + p++; + } + + if (ret != 1) + return; + +#ifdef CONFIG_PCI_IOV + if (dev->is_virtfn) { + iov = dev->physfn->sriov; + iov->ctrl &= ~PCI_SRIOV_CTRL_VF_10BIT_TAG_REQ_EN; + pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, + iov->ctrl); + pci_info(dev, "disabled PF SRIOV 10-Bit Tag Requester\n"); + return; +#endif + } + + pcie_capability_clear_word(dev, 
PCI_EXP_DEVCTL2, + PCI_EXP_DEVCTL2_10BIT_TAG_REQ_EN); + + pci_info(dev, "disabled 10-Bit Tag Requester\n"); +} + /** * pci_ext_cfg_avail - can we access extended PCI config space? * @@ -6643,6 +6696,8 @@ static int __init pci_setup(char *str) pci_add_flags(PCI_SCAN_ALL_PCIE_DEVS); } else if (!strncmp(str, "disable_acs_redir=", 18)) { disable_acs_redir_param = str + 18; + } else if (!strncmp(str, "disable_10bit_tag=", 18)) { + disable_10bit_tag_param = str + 18; } else { pr_err("PCI: Unknown option `%s'\n", str); } @@ -6667,6 +6722,7 @@ static int __init pci_realloc_setup_params(void) resource_alignment_param = kstrdup(resource_alignment_param, GFP_KERNEL); disable_acs_redir_param = kstrdup(disable_acs_redir_param, GFP_KERNEL); + disable_10bit_tag_param = kstrdup(disable_10bit_tag_param, GFP_KERNEL); return 0; } diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 93dcdd4..87c8187 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -16,6 +16,7 @@ extern bool pci_early_dump; bool pcie_cap_has_lnkctl(const struct pci_dev *dev); bool pcie_cap_has_rtctl(const struct pci_dev *dev); +void pci_disable_10bit_tag(struct pci_dev *dev); /* Functions internal to the PCI core code */ diff --git a/drivers/pci/pcie/portdrv_pci.c b/drivers/pci/pcie/portdrv_pci.c index 2382cd2..747728e 100644 --- a/drivers/pci/pcie/portdrv_pci.c +++ b/drivers/pci/pcie/portdrv_pci.c @@ -125,15 +125,15 @@ static void pci_configure_rp_10bit_tag(struct pci_dev *dev) bool support = true; if (dev->subordinate == NULL) - return; + goto disable_10bit_tag_req; /* If no devices under the root port, no need to enable 10-Bit Tag. */ if (list_empty(&dev->subordinate->devices)) - return; + goto disable_10bit_tag_req; pci_10bit_tag_comp_support(dev, &support); if (!support) - return; + goto disable_10bit_tag_req; /* * PCIe spec 5.0r1.0 section 2.2.6.2 implementation note. 
@@ -146,14 +146,17 @@ static void pci_configure_rp_10bit_tag(struct pci_dev *dev) */ pci_walk_bus(dev->subordinate, pci_10bit_tag_comp_support, &support); if (!support) - return; + goto disable_10bit_tag_req; if (!(dev->pcie_devcap2 & PCI_EXP_DEVCAP2_10BIT_TAG_REQ)) - return; + goto disable_10bit_tag_req; pci_dbg(dev, "enabling 10-Bit Tag Requester\n"); pcie_capability_set_word(dev, PCI_EXP_DEVCTL2, PCI_EXP_DEVCTL2_10BIT_TAG_REQ_EN); + +disable_10bit_tag_req: + pci_disable_10bit_tag(dev); } /* diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 3da7baa..0b7b053 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -2034,11 +2034,11 @@ static void pci_configure_10bit_tags(struct pci_dev *dev) struct pci_dev *bridge; if (!(dev->pcie_devcap2 & PCI_EXP_DEVCAP2_10BIT_TAG_COMP)) - return; + goto disable_10bit_tag_req; if (pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT) { dev->ext_10bit_tag = 1; - return; + goto disable_10bit_tag_req; } bridge = pci_upstream_bridge(dev); @@ -2050,7 +2050,7 @@ static void pci_configure_10bit_tags(struct pci_dev *dev) * for VF. */ if (dev->is_virtfn) - return; + goto disable_10bit_tag_req; if (pci_pcie_type(dev) == PCI_EXP_TYPE_ENDPOINT && dev->ext_10bit_tag == 1 && @@ -2059,6 +2059,9 @@ static void pci_configure_10bit_tags(struct pci_dev *dev) pcie_capability_set_word(dev, PCI_EXP_DEVCTL2, PCI_EXP_DEVCTL2_10BIT_TAG_REQ_EN); } + +disable_10bit_tag_req: + pci_disable_10bit_tag(dev); } int pci_configure_extended_tags(struct pci_dev *dev, void *ign)
PCIe spec 5.0 r1.0 section 2.2.6.2 says that if an Endpoint supports sending Requests to other Endpoints (as opposed to host memory), the Endpoint must not send 10-Bit Tag Requests to another given Endpoint unless an implementation-specific mechanism determines that the target Endpoint supports 10-Bit Tag Completer capability. Add the "pci=disable_10bit_tag=" parameter to disable the 10-Bit Tag Requester on the specified devices when a peer device does not support 10-Bit Tag Completer capability. This will make P2P traffic safe. Signed-off-by: Dongdong Liu <liudongdong3@huawei.com> --- Documentation/admin-guide/kernel-parameters.txt | 7 ++++ drivers/pci/pci.c | 56 +++++++++++++++++++++++++ drivers/pci/pci.h | 1 + drivers/pci/pcie/portdrv_pci.c | 13 +++--- drivers/pci/probe.c | 9 ++-- 5 files changed, 78 insertions(+), 8 deletions(-)