diff mbox series

[ANNOUNCE] v5.9-rc3-rt3

Message ID 20200902155557.h2wl2qpfn2rwsofw@linutronix.de
State New
Headers show
Series [ANNOUNCE] v5.9-rc3-rt3 | expand

Commit Message

Sebastian Andrzej Siewior Sept. 2, 2020, 3:55 p.m. UTC
Dear RT folks!

I'm pleased to announce the v5.9-rc3-rt3 patch set. 

Changes since v5.9-rc3-rt2:

  - Correct a compile issue in the i915 driver. Reported by Carsten Emde
    and Daniel Wagner.

  - Mark Marshall reported a crash on PowerPC. The reason for the crash
    is a race in exec_mmap() vs a context switch and is not limited to
    PowerPC. This race is present since v5.4.3-rt1 and is addressed in
    two changes:

    - commit 38cf307c1f201 ("mm: fix kthread_use_mm() vs TLB invalidate")
      which is part of v5.9-rc1.

    - patch "mm: fix exec activate_mm vs TLB shootdown and lazy tlb switching race"
      by Nicholas Piggin which has been posted for review and is not yet
      merged upstream.

Known issues
     - It has been pointed out that due to changes to the printk code the
       internal buffer representation changed. This is only an issue if tools
       like `crash' are used to extract the printk buffer from a kernel memory
       image.

The delta patch against v5.9-rc3-rt2 is appended below and can be found here:
 
     https://cdn.kernel.org/pub/linux/kernel/projects/rt/5.9/incr/patch-5.9-rc3-rt2-rt3.patch.xz

You can get this release via the git tree at:

    git://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-rt-devel.git v5.9-rc3-rt3

The RT patch against v5.9-rc3 can be found here:

    https://cdn.kernel.org/pub/linux/kernel/projects/rt/5.9/older/patch-5.9-rc3-rt3.patch.xz

The split quilt queue is available at:

    https://cdn.kernel.org/pub/linux/kernel/projects/rt/5.9/older/patches-5.9-rc3-rt3.tar.xz

Sebastian

Comments

Mike Galbraith Sept. 5, 2020, 4:47 a.m. UTC | #1
[   22.004225] r8169 0000:03:00.0 eth0: Link is Up - 1Gbps/Full - flow control off
[   22.004450] br0: port 1(eth0) entered blocking state
[   22.004473] br0: port 1(eth0) entered forwarding state
[   22.006411] IPv6: ADDRCONF(NETDEV_CHANGE): br0: link becomes ready

[   22.024936] ======================================================
[   22.024936] WARNING: possible circular locking dependency detected
[   22.024937] 5.9.0.gc70672d-rt3-rt #8 Tainted: G            E
[   22.024938] ------------------------------------------------------
[   22.024939] ksoftirqd/0/10 is trying to acquire lock:
[   22.024941] ffff983475521278 (&sch->q.lock){+...}-{0:0}, at: sch_direct_xmit+0x81/0x2f0
[   22.024947]
               but task is already holding lock:
[   22.024947] ffff9834755212b8 (&s->seqcount#9){+...}-{0:0}, at: br_dev_queue_push_xmit+0x7d/0x180 [bridge]
[   22.024959]
               which lock already depends on the new lock.

[   22.024960]
               the existing dependency chain (in reverse order) is:
[   22.024961]
               -> #1 (&s->seqcount#9){+...}-{0:0}:
[   22.024963]        lock_acquire+0x92/0x3f0
[   22.024967]        __dev_queue_xmit+0xce7/0xe30
[   22.024969]        br_dev_queue_push_xmit+0x7d/0x180 [bridge]
[   22.024974]        br_forward_finish+0x10a/0x1b0 [bridge]
[   22.024980]        __br_forward+0x17d/0x300 [bridge]
[   22.024984]        br_dev_xmit+0x442/0x570 [bridge]
[   22.024990]        dev_hard_start_xmit+0xc5/0x3f0
[   22.024992]        __dev_queue_xmit+0x9db/0xe30
[   22.024993]        ip6_finish_output2+0x26a/0x990
[   22.024995]        ip6_output+0x6d/0x260
[   22.024996]        mld_sendpack+0x1d9/0x360
[   22.024999]        mld_ifc_timer_expire+0x1f7/0x370
[   22.025000]        call_timer_fn+0xa0/0x390
[   22.025003]        run_timer_softirq+0x59a/0x720
[   22.025004]        __do_softirq+0xc1/0x5b2
[   22.025006]        run_ksoftirqd+0x47/0x70
[   22.025007]        smpboot_thread_fn+0x266/0x320
[   22.025009]        kthread+0x171/0x190
[   22.025010]        ret_from_fork+0x1f/0x30
[   22.025013]
               -> #0 (&sch->q.lock){+...}-{0:0}:
[   22.025015]        validate_chain+0xa81/0x1230
[   22.025016]        __lock_acquire+0x880/0xbf0
[   22.025017]        lock_acquire+0x92/0x3f0
[   22.025018]        rt_spin_lock+0x78/0xd0
[   22.025020]        sch_direct_xmit+0x81/0x2f0
[   22.025022]        __dev_queue_xmit+0xd38/0xe30
[   22.025023]        br_dev_queue_push_xmit+0x7d/0x180 [bridge]
[   22.025029]        br_forward_finish+0x10a/0x1b0 [bridge]
[   22.025033]        __br_forward+0x17d/0x300 [bridge]
[   22.025039]        br_dev_xmit+0x442/0x570 [bridge]
[   22.025043]        dev_hard_start_xmit+0xc5/0x3f0
[   22.025044]        __dev_queue_xmit+0x9db/0xe30
[   22.025046]        ip6_finish_output2+0x26a/0x990
[   22.025047]        ip6_output+0x6d/0x260
[   22.025049]        mld_sendpack+0x1d9/0x360
[   22.025050]        mld_ifc_timer_expire+0x1f7/0x370
[   22.025052]        call_timer_fn+0xa0/0x390
[   22.025053]        run_timer_softirq+0x59a/0x720
[   22.025054]        __do_softirq+0xc1/0x5b2
[   22.025055]        run_ksoftirqd+0x47/0x70
[   22.025056]        smpboot_thread_fn+0x266/0x320
[   22.025058]        kthread+0x171/0x190
[   22.025059]        ret_from_fork+0x1f/0x30
[   22.025060]
               other info that might help us debug this:

[   22.025061]  Possible unsafe locking scenario:

[   22.025061]        CPU0                    CPU1
[   22.025061]        ----                    ----
[   22.025062]   lock(&s->seqcount#9);
[   22.025064]                                lock(&sch->q.lock);
[   22.025065]                                lock(&s->seqcount#9);
[   22.025065]   lock(&sch->q.lock);
[   22.025066]
                *** DEADLOCK ***

[   22.025066] 20 locks held by ksoftirqd/0/10:
[   22.025067]  #0: ffffffff9a4c7140 (rcu_read_lock){....}-{1:3}, at: rt_spin_lock+0x5/0xd0
[   22.025071]  #1: ffff98351ec1a6d0 (per_cpu_ptr(&bh_lock.l.lock, cpu)){....}-{3:3}, at: __local_bh_disable_ip+0xbf/0x230
[   22.025074]  #2: ffffffff9a4c7140 (rcu_read_lock){....}-{1:3}, at: __local_bh_disable_ip+0xfb/0x230
[   22.025077]  #3: ffffffff9a4c7140 (rcu_read_lock){....}-{1:3}, at: rt_spin_lock+0x5/0xd0
[   22.025080]  #4: ffff98351ec1b338 (&base->expiry_lock){+...}-{0:0}, at: run_timer_softirq+0x3e6/0x720
[   22.025083]  #5: ffffb32e8007bd68 ((&idev->mc_ifc_timer)){+...}-{0:0}, at: call_timer_fn+0x5/0x390
[   22.025086]  #6: ffffffff9a4c7140 (rcu_read_lock){....}-{1:3}, at: mld_sendpack+0x5/0x360
[   22.025090]  #7: ffffffff9a4c7140 (rcu_read_lock){....}-{1:3}, at: __local_bh_disable_ip+0xfb/0x230
[   22.025093]  #8: ffffffff9a4c7100 (rcu_read_lock_bh){....}-{1:3}, at: ip6_finish_output2+0x73/0x990
[   22.025096]  #9: ffffffff9a4c7140 (rcu_read_lock){....}-{1:3}, at: __local_bh_disable_ip+0xfb/0x230
[   22.025097]  #10: ffffffff9a4c7100 (rcu_read_lock_bh){....}-{1:3}, at: __dev_queue_xmit+0x63/0xe30
[   22.025100]  #11: ffffffff9a4c7140 (rcu_read_lock){....}-{1:3}, at: br_dev_xmit+0x5/0x570 [bridge]
[   22.025108]  #12: ffffffff9a4c7140 (rcu_read_lock){....}-{1:3}, at: __local_bh_disable_ip+0xfb/0x230
[   22.025110]  #13: ffffffff9a4c7100 (rcu_read_lock_bh){....}-{1:3}, at: __dev_queue_xmit+0x63/0xe30
[   22.025113]  #14: ffffffff9a4c7140 (rcu_read_lock){....}-{1:3}, at: rt_spin_lock+0x5/0xd0
[   22.025116]  #15: ffff9834755215f0 (dev->qdisc_tx_busylock ?: &qdisc_tx_busylock){+...}-{0:0}, at: __dev_queue_xmit+0x8a4/0xe30
[   22.025119]  #16: ffffffff9a4c7140 (rcu_read_lock){....}-{1:3}, at: rt_spin_lock+0x5/0xd0
[   22.025121]  #17: ffff983475521398 (dev->qdisc_running_key ?: &qdisc_running_key){+...}-{0:0}, at: __dev_queue_xmit+0xca6/0xe30
[   22.025124]  #18: ffff9834755212b8 (&s->seqcount#9){+...}-{0:0}, at: br_dev_queue_push_xmit+0x7d/0x180 [bridge]
[   22.025132]  #19: ffffffff9a4c7140 (rcu_read_lock){....}-{1:3}, at: rt_spin_lock+0x5/0xd0
[   22.025134]
               stack backtrace:
[   22.025134] CPU: 0 PID: 10 Comm: ksoftirqd/0 Kdump: loaded Tainted: G            E     5.9.0.gc70672d-rt3-rt #8
[   22.025135] Hardware name: MEDION MS-7848/MS-7848, BIOS M7848W08.20C 09/23/2013
[   22.025136] Call Trace:
[   22.025138]  dump_stack+0x77/0x9b
[   22.025143]  check_noncircular+0x148/0x160
[   22.025147]  ? validate_chain+0xa81/0x1230
[   22.025148]  validate_chain+0xa81/0x1230
[   22.025153]  __lock_acquire+0x880/0xbf0
[   22.025157]  lock_acquire+0x92/0x3f0
[   22.025158]  ? sch_direct_xmit+0x81/0x2f0
[   22.025160]  ? rt_spin_unlock+0x39/0x90
[   22.025162]  rt_spin_lock+0x78/0xd0
[   22.025164]  ? sch_direct_xmit+0x81/0x2f0
[   22.025166]  sch_direct_xmit+0x81/0x2f0
[   22.025169]  __dev_queue_xmit+0xd38/0xe30
[   22.025173]  ? find_held_lock+0x2d/0x90
[   22.025176]  ? br_dev_queue_push_xmit+0x7d/0x180 [bridge]
[   22.025182]  br_dev_queue_push_xmit+0x7d/0x180 [bridge]
[   22.025190]  br_forward_finish+0x10a/0x1b0 [bridge]
[   22.025196]  ? __br_forward+0x151/0x300 [bridge]
[   22.025204]  __br_forward+0x17d/0x300 [bridge]
[   22.025211]  ? br_flood+0x98/0x120 [bridge]
[   22.025216]  br_dev_xmit+0x442/0x570 [bridge]
[   22.025224]  dev_hard_start_xmit+0xc5/0x3f0
[   22.025226]  ? netif_skb_features+0xb0/0x230
[   22.025228]  __dev_queue_xmit+0x9db/0xe30
[   22.025231]  ? eth_header+0x25/0xc0
[   22.025235]  ? ip6_finish_output2+0x26a/0x990
[   22.025236]  ip6_finish_output2+0x26a/0x990
[   22.025239]  ? ip6_mtu+0x135/0x1b0
[   22.025241]  ? ip6_output+0x6d/0x260
[   22.025243]  ip6_output+0x6d/0x260
[   22.025246]  ? __ip6_finish_output+0x210/0x210
[   22.025249]  mld_sendpack+0x1d9/0x360
[   22.025252]  ? mld_ifc_timer_expire+0x119/0x370
[   22.025254]  mld_ifc_timer_expire+0x1f7/0x370
[   22.025256]  ? mld_dad_timer_expire+0xb0/0xb0
[   22.025258]  ? mld_dad_timer_expire+0xb0/0xb0
[   22.025260]  call_timer_fn+0xa0/0x390
[   22.025263]  ? mld_dad_timer_expire+0xb0/0xb0
[   22.025264]  run_timer_softirq+0x59a/0x720
[   22.025268]  ? lock_acquire+0x92/0x3f0
[   22.025272]  __do_softirq+0xc1/0x5b2
[   22.025274]  ? smpboot_thread_fn+0x28/0x320
[   22.025276]  ? smpboot_thread_fn+0x28/0x320
[   22.025278]  ? smpboot_thread_fn+0x70/0x320
[   22.025279]  run_ksoftirqd+0x47/0x70
[   22.025281]  smpboot_thread_fn+0x266/0x320
[   22.025284]  ? smpboot_register_percpu_thread+0xe0/0xe0
[   22.025286]  kthread+0x171/0x190
[   22.025287]  ? kthread_park+0x90/0x90
[   22.025288]  ret_from_fork+0x1f/0x30
[   22.176416] NET: Registered protocol family 17
Sebastian Andrzej Siewior Sept. 8, 2020, 3:12 p.m. UTC | #2
On 2020-09-05 07:19:10 [+0200], Mike Galbraith wrote:
> Lappy, which does not use bridge, boots clean... but lock leakage
> pretty darn quickly inspires lockdep to crap its drawers.
> 
> [  209.001111] BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!
> [  209.001113] turning off the locking correctness validator.
> [  209.001114] CPU: 2 PID: 3773 Comm: Socket Thread Tainted: G S        I E     5.9.0.gc70672d-rt3-rt #8
> [  209.001117] Hardware name: HP HP Spectre x360 Convertible/804F, BIOS F.47 11/22/2017
> [  209.001118] Call Trace:
> [  209.001123]  dump_stack+0x77/0x9b
> [  209.001129]  validate_chain+0xf60/0x1230

I have no idea how to debug this based on this report. Can you narrow
it down to something?

Is Lappy new, got a new something or has a new config switch? I'm just
curious if this is something new or something that was always there but
remained undetected.
(Your other report was about something that was previously always "broken".)

Sebastian
Mike Galbraith Sept. 8, 2020, 3:59 p.m. UTC | #3
On Tue, 2020-09-08 at 17:12 +0200, Sebastian Andrzej Siewior wrote:
> On 2020-09-05 07:19:10 [+0200], Mike Galbraith wrote:
> > Lappy, which does not use bridge, boots clean... but lock leakage
> > pretty darn quickly inspires lockdep to crap its drawers.
> >
> > [  209.001111] BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!
> > [  209.001113] turning off the locking correctness validator.
> > [  209.001114] CPU: 2 PID: 3773 Comm: Socket Thread Tainted: G S        I E     5.9.0.gc70672d-rt3-rt #8
> > [  209.001117] Hardware name: HP HP Spectre x360 Convertible/804F, BIOS F.47 11/22/2017
> > [  209.001118] Call Trace:
> > [  209.001123]  dump_stack+0x77/0x9b
> > [  209.001129]  validate_chain+0xf60/0x1230
>
> I have no idea how to debug this based on this report. Can you narrow
> it down to something?

I instrumented what I presume is still this problem once upon a time:
structures containing locks are allocated/initialized/freed again and
again with no cleanup until we increment into the wall.

> Is Lappy new, got a new something or has a new config switch? I'm just
> curious if this is something new or something that was always there but
> remained undetected.

Nah, this is nothing new.  Turn lockdep on in RT, it's just a matter of
time before it turns itself off.  It's usually just not _that_ quick.

	-Mike
Mike Galbraith Sept. 8, 2020, 4:19 p.m. UTC | #4
On Tue, 2020-09-08 at 17:06 +0200, Sebastian Andrzej Siewior wrote:
> On 2020-09-08 16:56:20 [+0200], Mike Galbraith wrote:
> > On Tue, 2020-09-08 at 14:19 +0200, Sebastian Andrzej Siewior wrote:
> > >
> > > This has nothing to do with the bridge but with the fact that you use a
> > > non standard queue class (something else than pfifo_fast).
> >
> > That must be SUSE, I don't muck about in network land.  I downloaded a
> > whole library of RFCs decades ago, but turns out that one of those is
> > all the bedtime story you'll ever need.  Huge waste of bandwidth :)
>
> I see.
> This should cure it:

I'll give that a go.

	-Mike
Mike Galbraith Sept. 9, 2020, 3:12 a.m. UTC | #5
On Wed, 2020-09-02 at 17:55 +0200, Sebastian Andrzej Siewior wrote:
>
> Known issues
>      - It has been pointed out that due to changes to the printk code the
>        internal buffer representation changed. This is only an issue if tools
>        like `crash' are used to extract the printk buffer from a kernel memory
>        image.

Ouch.  While installing -rt5 on lappy via nfs, -rt5 server box exploded
leaving nada in logs.  I have a nifty crash dump of the event, but...

	-Mike
Mike Galbraith Sept. 9, 2020, 5:45 a.m. UTC | #6
On Wed, 2020-09-09 at 05:12 +0200, Mike Galbraith wrote:
> On Wed, 2020-09-02 at 17:55 +0200, Sebastian Andrzej Siewior wrote:
> >
> > Known issues
> >      - It has been pointed out that due to changes to the printk code the
> >        internal buffer representation changed. This is only an issue if tools
> >        like `crash' are used to extract the printk buffer from a kernel memory
> >        image.
>
> Ouch.  While installing -rt5 on lappy via nfs, -rt5 server box exploded
> leaving nada in logs.  I have a nifty crash dump of the event, but...

After convincing crash (with club) that it didn't _really_ need a
log_buf, nfs had nothing to do with the crash, it was nouveau.

      KERNEL: vmlinux-5.9.0.gf4d51df-rt5-rt.gz
    DUMPFILE: vmcore
        CPUS: 8
        DATE: Wed Sep  9 04:41:24 2020
      UPTIME: 00:08:10
LOAD AVERAGE: 3.17, 1.86, 0.99
       TASKS: 715
    NODENAME: homer
     RELEASE: 5.9.0.gf4d51df-rt5-rt
     VERSION: #1 SMP PREEMPT_RT Wed Sep 9 03:22:01 CEST 2020
     MACHINE: x86_64  (3591 Mhz)
      MEMORY: 16 GB
       PANIC: ""
         PID: 2146
     COMMAND: "X"
        TASK: ffff994c7fad0000  [THREAD_INFO: ffff994c7fad0000]
         CPU: 0
       STATE: TASK_RUNNING (PANIC)

crash> bt -l
PID: 2146   TASK: ffff994c7fad0000  CPU: 0   COMMAND: "X"
 #0 [ffffbfffc11a76c8] machine_kexec at ffffffffb7064879
    /backup/usr/local/src/kernel/linux-master-rt/./include/linux/ftrace.h: 792
 #1 [ffffbfffc11a7710] __crash_kexec at ffffffffb7173622
    /backup/usr/local/src/kernel/linux-master-rt/kernel/kexec_core.c: 963
 #2 [ffffbfffc11a77d0] crash_kexec at ffffffffb7174920
    /backup/usr/local/src/kernel/linux-master-rt/./arch/x86/include/asm/atomic.h: 41
 #3 [ffffbfffc11a77e0] oops_end at ffffffffb702716f
    /backup/usr/local/src/kernel/linux-master-rt/arch/x86/kernel/dumpstack.c: 342
 #4 [ffffbfffc11a7800] exc_general_protection at ffffffffb79a2fc6
    /backup/usr/local/src/kernel/linux-master-rt/arch/x86/kernel/traps.c: 82
 #5 [ffffbfffc11a7890] asm_exc_general_protection at ffffffffb7a00a1e
    /backup/usr/local/src/kernel/linux-master-rt/./arch/x86/include/asm/idtentry.h: 532
 #6 [ffffbfffc11a78a0] nvif_object_ctor at ffffffffc07ee6a7 [nouveau]
    /backup/usr/local/src/kernel/linux-master-rt/drivers/gpu/drm/nouveau/nvif/object.c: 280
 #7 [ffffbfffc11a7918] __kmalloc at ffffffffb72eea12
    /backup/usr/local/src/kernel/linux-master-rt/mm/slub.c: 261
 #8 [ffffbfffc11a7980] nvif_object_ctor at ffffffffc07ee6a7 [nouveau]
    /backup/usr/local/src/kernel/linux-master-rt/drivers/gpu/drm/nouveau/nvif/object.c: 280
 #9 [ffffbfffc11a79d0] nvif_mem_ctor_type at ffffffffc07eef48 [nouveau]
    /backup/usr/local/src/kernel/linux-master-rt/drivers/gpu/drm/nouveau/nvif/mem.c: 74
#10 [ffffbfffc11a7aa8] nouveau_mem_vram at ffffffffc08b5291 [nouveau]
    /backup/usr/local/src/kernel/linux-master-rt/drivers/gpu/drm/nouveau/nouveau_mem.c: 155
#11 [ffffbfffc11a7b10] nouveau_vram_manager_new at ffffffffc08b594d [nouveau]
    /backup/usr/local/src/kernel/linux-master-rt/drivers/gpu/drm/nouveau/nouveau_ttm.c: 76
#12 [ffffbfffc11a7b30] ttm_bo_mem_space at ffffffffc05af2ac [ttm]
    /backup/usr/local/src/kernel/linux-master-rt/drivers/gpu/drm/ttm/ttm_bo.c: 1065
#13 [ffffbfffc11a7b88] ttm_bo_validate at ffffffffc05afaca [ttm]
    /backup/usr/local/src/kernel/linux-master-rt/drivers/gpu/drm/ttm/ttm_bo.c: 1137
#14 [ffffbfffc11a7c18] ttm_bo_init_reserved at ffffffffc05afe70 [ttm]
    /backup/usr/local/src/kernel/linux-master-rt/drivers/gpu/drm/ttm/ttm_bo.c: 1330
#15 [ffffbfffc11a7c60] ttm_bo_init at ffffffffc05afff7 [ttm]
    /backup/usr/local/src/kernel/linux-master-rt/drivers/gpu/drm/ttm/ttm_bo.c: 1364
#16 [ffffbfffc11a7cc8] nouveau_bo_init at ffffffffc08b0f7b [nouveau]
    /backup/usr/local/src/kernel/linux-master-rt/drivers/gpu/drm/nouveau/nouveau_bo.c: 317
#17 [ffffbfffc11a7d38] nouveau_gem_new at ffffffffc08b2f7b [nouveau]
    /backup/usr/local/src/kernel/linux-master-rt/drivers/gpu/drm/nouveau/nouveau_gem.c: 206
#18 [ffffbfffc11a7d70] nouveau_gem_ioctl_new at ffffffffc08b3001 [nouveau]
    /backup/usr/local/src/kernel/linux-master-rt/drivers/gpu/drm/nouveau/nouveau_gem.c: 272
#19 [ffffbfffc11a7da0] drm_ioctl_kernel at ffffffffc066f564 [drm]
    /backup/usr/local/src/kernel/linux-master-rt/drivers/gpu/drm/drm_ioctl.c: 793
#20 [ffffbfffc11a7de0] drm_ioctl at ffffffffc066f88e [drm]
    /backup/usr/local/src/kernel/linux-master-rt/./include/linux/uaccess.h: 168
#21 [ffffbfffc11a7ed0] nouveau_drm_ioctl at ffffffffc08abf56 [nouveau]
    /backup/usr/local/src/kernel/linux-master-rt/drivers/gpu/drm/nouveau/nouveau_drm.c: 1163
#22 [ffffbfffc11a7f08] __x64_sys_ioctl at ffffffffb733255e
    /backup/usr/local/src/kernel/linux-master-rt/fs/ioctl.c: 49
#23 [ffffbfffc11a7f40] do_syscall_64 at ffffffffb79a25c3
    /backup/usr/local/src/kernel/linux-master-rt/arch/x86/entry/common.c: 46
#24 [ffffbfffc11a7f50] entry_SYSCALL_64_after_hwframe at ffffffffb7a0008c
    /backup/usr/local/src/kernel/linux-master-rt/arch/x86/entry/entry_64.S: 125
    RIP: 00007f96707a6ac7  RSP: 00007ffc1cbc2998  RFLAGS: 00000246
    RAX: ffffffffffffffda  RBX: 000055743cf152e0  RCX: 00007f96707a6ac7
    RDX: 00007ffc1cbc29f0  RSI: 00000000c0306480  RDI: 000000000000000e
    RBP: 00007ffc1cbc29f0   R8: 0000000000000000   R9: 0000000000000003
    R10: fffffffffffffd98  R11: 0000000000000246  R12: 00000000c0306480
    R13: 000000000000000e  R14: 000055743ce99040  R15: 000055743c60cfd0
    ORIG_RAX: 0000000000000010  CS: 0033  SS: 002b
Sebastian Andrzej Siewior Sept. 9, 2020, 8:20 a.m. UTC | #7
On 2020-09-09 07:45:22 [+0200], Mike Galbraith wrote:
> On Wed, 2020-09-09 at 05:12 +0200, Mike Galbraith wrote:
> > On Wed, 2020-09-02 at 17:55 +0200, Sebastian Andrzej Siewior wrote:
> > >
> > > Known issues
> > >      - It has been pointed out that due to changes to the printk code the
> > >        internal buffer representation changed. This is only an issue if tools
> > >        like `crash' are used to extract the printk buffer from a kernel memory
> > >        image.
> >
> > Ouch.  While installing -rt5 on lappy via nfs, -rt5 server box exploded
> > leaving nada in logs.  I have a nifty crash dump of the event, but...
> 
> After convincing crash (with club) that it didn't _really_ need a
> log_buf, nfs had nothing to do with the crash, it was nouveau.

okay. Line 280 is hard to understand. My guess is that we got a pointer
and then the boom occurred but I can't tell why/how. A few lines later
there is args->x = y…
Do you see the lockdep splat without nouveau?

> crash> bt -l
> PID: 2146   TASK: ffff994c7fad0000  CPU: 0   COMMAND: "X"
>  #0 [ffffbfffc11a76c8] machine_kexec at ffffffffb7064879
>     /backup/usr/local/src/kernel/linux-master-rt/./include/linux/ftrace.h: 792
>  #1 [ffffbfffc11a7710] __crash_kexec at ffffffffb7173622
>     /backup/usr/local/src/kernel/linux-master-rt/kernel/kexec_core.c: 963
>  #2 [ffffbfffc11a77d0] crash_kexec at ffffffffb7174920
>     /backup/usr/local/src/kernel/linux-master-rt/./arch/x86/include/asm/atomic.h: 41
>  #3 [ffffbfffc11a77e0] oops_end at ffffffffb702716f
>     /backup/usr/local/src/kernel/linux-master-rt/arch/x86/kernel/dumpstack.c: 342
>  #4 [ffffbfffc11a7800] exc_general_protection at ffffffffb79a2fc6
>     /backup/usr/local/src/kernel/linux-master-rt/arch/x86/kernel/traps.c: 82
>  #5 [ffffbfffc11a7890] asm_exc_general_protection at ffffffffb7a00a1e
>     /backup/usr/local/src/kernel/linux-master-rt/./arch/x86/include/asm/idtentry.h: 532
>  #6 [ffffbfffc11a78a0] nvif_object_ctor at ffffffffc07ee6a7 [nouveau]
>     /backup/usr/local/src/kernel/linux-master-rt/drivers/gpu/drm/nouveau/nvif/object.c: 280
>  #7 [ffffbfffc11a7918] __kmalloc at ffffffffb72eea12
>     /backup/usr/local/src/kernel/linux-master-rt/mm/slub.c: 261
>  #8 [ffffbfffc11a7980] nvif_object_ctor at ffffffffc07ee6a7 [nouveau]
>     /backup/usr/local/src/kernel/linux-master-rt/drivers/gpu/drm/nouveau/nvif/object.c: 280

Sebastian
Sebastian Andrzej Siewior Sept. 9, 2020, 8:59 a.m. UTC | #8
On 2020-09-09 10:56:41 [+0200], Mike Galbraith wrote:
> On Wed, 2020-09-09 at 10:20 +0200, Sebastian Andrzej Siewior wrote:
> >
> > Do you see the lockdep splat without nouveau?
> 
> Yeah.  Lappy uses i915, but lockdep also shuts itself off.

You sent the config, I will try to throw it later on kvm and actual
hardware and see what happens.

> BTW, methinks RT had nothing to do with the nouveau burp.

that is good to hear :)

> 	-Mike

Sebastian
diff mbox series

Patch

diff --git a/arch/Kconfig b/arch/Kconfig
index 222e553f3cf50..5c8e173dc7c2b 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -415,6 +415,13 @@  config MMU_GATHER_NO_GATHER
 	bool
 	depends on MMU_GATHER_TABLE_FREE
 
+config ARCH_WANT_IRQS_OFF_ACTIVATE_MM
+	bool
+	help
+	  Temporary select until all architectures can be converted to have
+	  irqs disabled over activate_mm. Architectures that do IPI based TLB
+	  shootdowns should enable this.
+
 config ARCH_HAVE_NMI_SAFE_CMPXCHG
 	bool
 
diff --git a/drivers/gpu/drm/i915/display/intel_display_types.h b/drivers/gpu/drm/i915/display/intel_display_types.h
index c5700f44422ec..e8f809161c75f 100644
--- a/drivers/gpu/drm/i915/display/intel_display_types.h
+++ b/drivers/gpu/drm/i915/display/intel_display_types.h
@@ -29,7 +29,6 @@ 
 #include <linux/async.h>
 #include <linux/i2c.h>
 #include <linux/sched/clock.h>
-#include <linux/local_lock.h>
 
 #include <drm/drm_atomic.h>
 #include <drm/drm_crtc.h>
@@ -1150,7 +1149,6 @@  struct intel_crtc {
 #ifdef CONFIG_DEBUG_FS
 	struct intel_pipe_crc pipe_crc;
 #endif
-	local_lock_t pipe_update_lock;
 };
 
 struct intel_plane {
diff --git a/drivers/gpu/drm/i915/display/intel_sprite.c b/drivers/gpu/drm/i915/display/intel_sprite.c
index 62b8248d2ee79..1b9d5e690a9f0 100644
--- a/drivers/gpu/drm/i915/display/intel_sprite.c
+++ b/drivers/gpu/drm/i915/display/intel_sprite.c
@@ -118,7 +118,8 @@  void intel_pipe_update_start(const struct intel_crtc_state *new_crtc_state)
 			"PSR idle timed out 0x%x, atomic update may fail\n",
 			psr_status);
 
-	local_lock_irq(&crtc->pipe_update_lock);
+	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+		local_irq_disable();
 
 	crtc->debug.min_vbl = min;
 	crtc->debug.max_vbl = max;
@@ -143,11 +144,13 @@  void intel_pipe_update_start(const struct intel_crtc_state *new_crtc_state)
 			break;
 		}
 
-		local_unlock_irq(&crtc->pipe_update_lock);
+		if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+			local_irq_enable();
 
 		timeout = schedule_timeout(timeout);
 
-		local_lock_irq(&crtc->pipe_update_lock);
+		if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+			local_irq_disable();
 	}
 
 	finish_wait(wq, &wait);
@@ -180,7 +183,8 @@  void intel_pipe_update_start(const struct intel_crtc_state *new_crtc_state)
 	return;
 
 irq_disable:
-	local_lock_irq(&crtc->pipe_update_lock);
+	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+		local_irq_disable();
 }
 
 /**
@@ -218,7 +222,8 @@  void intel_pipe_update_end(struct intel_crtc_state *new_crtc_state)
 		new_crtc_state->uapi.event = NULL;
 	}
 
-	local_unlock_irq(&crtc->pipe_update_lock);
+	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+		local_irq_enable();
 
 	if (intel_vgpu_active(dev_priv))
 		return;
diff --git a/fs/exec.c b/fs/exec.c
index a91003e28eaae..d4fb18baf1fb1 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1130,11 +1130,24 @@  static int exec_mmap(struct mm_struct *mm)
 	}
 
 	task_lock(tsk);
-	active_mm = tsk->active_mm;
 	membarrier_exec_mmap(mm);
-	tsk->mm = mm;
+
+	local_irq_disable();
+	active_mm = tsk->active_mm;
 	tsk->active_mm = mm;
+	tsk->mm = mm;
+	/*
+	 * This prevents preemption while active_mm is being loaded and
+	 * it and mm are being updated, which could cause problems for
+	 * lazy tlb mm refcounting when these are updated by context
+	 * switches. Not all architectures can handle irqs off over
+	 * activate_mm yet.
+	 */
+	if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
+		local_irq_enable();
 	activate_mm(active_mm, mm);
+	if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
+		local_irq_enable();
 	tsk->mm->vmacache_seqnum = 0;
 	vmacache_flush(tsk);
 	task_unlock(tsk);
diff --git a/localversion-rt b/localversion-rt
index c3054d08a1129..1445cd65885cd 100644
--- a/localversion-rt
+++ b/localversion-rt
@@ -1 +1 @@ 
--rt2
+-rt3