NetBSD-Bugs archive
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]
Re: port-xen/58561 (panic: kernel diagnostic assertion, "x86_read_psl() == 0" failed: file, "/home/netbsd/10/src/sys/arch/x86/x86/pmap.c", line 3581)
- To: Konrad Schroder <perseant%hhhh.org@localhost>
- Subject: Re: port-xen/58561 (panic: kernel diagnostic assertion, "x86_read_psl() == 0" failed: file, "/home/netbsd/10/src/sys/arch/x86/x86/pmap.c", line 3581)
- From: Manuel Bouyer <bouyer%antioche.eu.org@localhost>
- Date: Tue, 13 Jan 2026 15:50:10 +0100
On Tue, Jan 13, 2026 at 03:23:14PM +0100, Manuel Bouyer wrote:
> On Mon, Jan 12, 2026 at 11:20:34PM +0100, Manuel Bouyer wrote:
> > On Sun, Jan 11, 2026 at 08:35:10PM -0800, Konrad Schroder wrote:
> > > On 1/10/2026 2:33 PM, Manuel Bouyer wrote:
> > > > And this assembly work may have allowed me to find the problem.
> > > > Basically, the CLI and STI macros are not atomic on Xen PV, and if preemption
> > > > happens at a bad time we may end up updating the upcall_mask of our previous
> > > > CPU. The attached patch should close this race condition; hopefully it's
> > > > the last one.
> > >
> > > I can confirm that, at least in my case, this allows me to build
> > > continuously all day without a crash.
> >
> > Good, it points that we're on the right track.
> > Can you try the attached patch instead ? It tries to limit the extra
> > instructions to only places where it's needed, and add DIAGNOSTIC code
> > to check it's really not needed in other places.
>
> Sorry this patch has a bug; working on an updated version
> stay tuned.
Here's the updated patch
--
Manuel Bouyer <bouyer%antioche.eu.org@localhost>
NetBSD: 26 ans d'experience feront toujours la difference
--
Index: amd64/amd64/amd64_trap.S
===================================================================
RCS file: /cvsroot/src/sys/arch/amd64/amd64/amd64_trap.S,v
retrieving revision 1.55
diff -u -p -u -r1.55 amd64_trap.S
--- amd64/amd64/amd64_trap.S 27 Feb 2023 16:24:28 -0000 1.55
+++ amd64/amd64/amd64_trap.S 13 Jan 2026 14:48:25 -0000
@@ -90,7 +90,7 @@
*/
#ifdef XENPV
-#define PRE_TRAP CLI(cx); movq (%rsp),%rcx ; movq 8(%rsp),%r11 ; addq $0x10,%rsp
+#define PRE_TRAP CLI2(cx,r11); movq (%rsp),%rcx ; movq 8(%rsp),%r11 ; addq $0x10,%rsp
#else
#define PRE_TRAP
#endif
@@ -671,7 +671,7 @@ calltrap:
.Lalltraps_checkast:
movq CPUVAR(CURLWP),%r14
/* Check for ASTs on exit to user mode. */
- CLI(si)
+ CLI2(si, rdi)
CHECK_ASTPENDING(%r14)
je 3f
CLEAR_ASTPENDING(%r14)
Index: amd64/amd64/cpufunc.S
===================================================================
RCS file: /cvsroot/src/sys/arch/amd64/amd64/cpufunc.S,v
retrieving revision 1.70
diff -u -p -u -r1.70 cpufunc.S
--- amd64/amd64/cpufunc.S 6 Sep 2025 02:53:22 -0000 1.70
+++ amd64/amd64/cpufunc.S 13 Jan 2026 14:48:25 -0000
@@ -184,7 +184,7 @@ ENTRY(wbinvd)
END(wbinvd)
ENTRY(setusergs)
- CLI(ax)
+ CLI2(ax, rdi)
swapgs
movw %di, %gs
swapgs
Index: amd64/amd64/locore.S
===================================================================
RCS file: /cvsroot/src/sys/arch/amd64/amd64/locore.S,v
retrieving revision 1.235
diff -u -p -u -r1.235 locore.S
--- amd64/amd64/locore.S 31 Dec 2025 15:33:50 -0000 1.235
+++ amd64/amd64/locore.S 13 Jan 2026 14:48:25 -0000
@@ -1641,7 +1641,7 @@ ENTRY(handle_syscall)
* to ensure we don't take an interrupt with some of the user
* registers loaded.
*/
- CLI(si)
+ CLI2(si, rax)
/* Check for ASTs on exit to user mode. */
movl L_MD_ASTPENDING(%r14),%eax
orl CPUVAR(WANT_PMAPLOAD),%eax
@@ -1768,7 +1768,9 @@ IDTVEC(\name)
* But it didn't disable events
*/
pushq %rsi
- CLI(si)
+ pushq %rdi
+ CLI2(si, rdi)
+ popq %rdi
popq %rsi
addq $0x10,%rsp /* gap to match cs:rip */
pushq $2 /* error code */
@@ -1813,7 +1815,9 @@ IDTVEC_END(syscall32)
IDTVEC(osyscall)
#ifdef XENPV
pushq %rsi
- CLI(si)
+ pushq %rdi
+ CLI2(si, rdi)
+ popq %rdi
popq %rsi
movq (%rsp),%rcx
movq 8(%rsp),%r11
Index: amd64/amd64/spl.S
===================================================================
RCS file: /cvsroot/src/sys/arch/amd64/amd64/spl.S,v
retrieving revision 1.49
diff -u -p -u -r1.49 spl.S
--- amd64/amd64/spl.S 1 Mar 2023 08:38:50 -0000 1.49
+++ amd64/amd64/spl.S 13 Jan 2026 14:48:25 -0000
@@ -313,6 +313,7 @@ END(spllower)
* a lower-prio one first, which needs to take the kernel lock -->
* the sending CPU will never see the that CPU accept the IPI
* (see pmap_tlb_shootnow).
+ * XenPV note: it seems that we can be called at spl0, so need to use CLI2()
*/
IDTVEC(spllower)
pushq %rbx
@@ -323,7 +324,7 @@ IDTVEC(spllower)
1:
movl %ebx,%eax /* get cpl */
movq CPUVAR(IUNMASK)(,%rax,8),%rax
- CLI(si)
+ CLI2(si, r12)
andq CPUVAR(IPENDING),%rax /* any non-masked bits left? */
jz 2f
bsrq %rax,%rax
@@ -355,7 +356,7 @@ IDTVEC(doreti)
1:
movl %ebx,%eax
movq CPUVAR(IUNMASK)(,%rax,8),%rax
- CLI(si)
+ CLI2(si, r14)
andq CPUVAR(IPENDING),%rax
jz 2f
bsrq %rax,%rax /* slow, but not worth optimizing */
@@ -378,7 +379,7 @@ IDTVEC(doreti)
movq %rsp,%rdi
KMSAN_INIT_ARG(8)
call _C_LABEL(trap)
- CLI(si)
+ CLI2(si, rdi)
jmp .Ldoreti_checkast
3:
CHECK_DEFERRED_SWITCH
@@ -389,6 +390,6 @@ IDTVEC(doreti)
9:
STI(si)
call _C_LABEL(do_pmap_load)
- CLI(si)
+ CLI2(si, rdi)
jmp .Ldoreti_checkast /* recheck ASTs */
IDTVEC_END(doreti)
Index: amd64/include/frameasm.h
===================================================================
RCS file: /cvsroot/src/sys/arch/amd64/include/frameasm.h,v
retrieving revision 1.55
diff -u -p -u -r1.55 frameasm.h
--- amd64/include/frameasm.h 30 Jul 2022 14:11:00 -0000 1.55
+++ amd64/include/frameasm.h 13 Jan 2026 14:48:25 -0000
@@ -23,31 +23,68 @@
#define XEN_ONLY2(x,y) x,y
#define NOT_XEN(x)
+#ifdef DIAGNOSTIC
+/*
+ * acessing EVTCHN_UPCALL_MASK is save only if preemption is disabled, i.e.:
+ * l_nopreempt is not 0, or
+ * ci_ilevel is nit 0, or
+ * EVTCHN_UPCALL_MASK is not 0
+ * ci_idepth is not negative
+ */
#define CLI(temp_reg) \
- movq CPUVAR(VCPU),%r ## temp_reg ; \
+ movq CPUVAR(CURLWP),%r ## temp_reg ; \
+ cmpl $0, L_NOPREEMPT(%r ## temp_reg); \
+ jne 199f; \
+ cmpb $0, CPUVAR(ILEVEL); \
+ jne 199f; \
+ movl CPUVAR(IDEPTH), %e ## temp_reg; \
+ test %e ## temp_reg, %e ## temp_reg; \
+ jns 199f; \
+ movq $panicstr,%r ## temp_reg ; \
+ cmpq $0, 0(%r ## temp_reg); \
+ jne 199f; \
+ movq _C_LABEL(cli_panic), %rdi; \
+ call _C_LABEL(panic); \
+199: movq CPUVAR(VCPU),%r ## temp_reg ; \
movb $1,EVTCHN_UPCALL_MASK(%r ## temp_reg);
#define STI(temp_reg) \
movq CPUVAR(VCPU),%r ## temp_reg ; \
- movb $0,EVTCHN_UPCALL_MASK(%r ## temp_reg);
+ cmpb $0, EVTCHN_UPCALL_MASK(%r ## temp_reg); \
+ jne 198f; \
+ movq $panicstr,%r ## temp_reg ; \
+ cmpq $0, 0(%r ## temp_reg); \
+ jne 198f; \
+ movq _C_LABEL(sti_panic), %rdi; \
+ call _C_LABEL(panic); \
+198: movb $0,EVTCHN_UPCALL_MASK(%r ## temp_reg); \
+ movq CPUVAR(CURLWP),%r ## temp_reg ;
+#else
+
+#define CLI(temp_reg) \
+ movq CPUVAR(VCPU),%r ## temp_reg ; \
+ movb $1,EVTCHN_UPCALL_MASK(%r ## temp_reg);
-#define PUSHF(temp_reg) \
+#define STI(temp_reg) \
movq CPUVAR(VCPU),%r ## temp_reg ; \
- movzbl EVTCHN_UPCALL_MASK(%r ## temp_reg), %e ## temp_reg; \
- pushq %r ## temp_reg
+ movb $0,EVTCHN_UPCALL_MASK(%r ## temp_reg); \
-#define POPF \
- popq %rdi; \
- call _C_LABEL(xen_write_psl)
+#endif /* DIAGNOSTIC */
+/* CLI() with preemtion disabled */
+#define CLI2(temp_reg, temp_reg2) \
+ movq CPUVAR(CURLWP),% ## temp_reg2 ; \
+ incl L_NOPREEMPT(% ## temp_reg2); \
+ movq CPUVAR(VCPU),%r ## temp_reg ; \
+ movb $1,EVTCHN_UPCALL_MASK(%r ## temp_reg); \
+ decl L_NOPREEMPT(% ## temp_reg2);
#else /* XENPV */
#define XEN_ONLY2(x,y)
#define NOT_XEN(x) x
#define CLI(temp_reg) cli
+#define CLI2(temp_reg, temp_reg2) cli
#define STI(temp_reg) sti
-#define PUSHF(temp_reg) pushf
-#define POPL popl
#endif /* XENPV */
#define HP_NAME_CLAC 1
Index: xen/x86/xen_intr.c
===================================================================
RCS file: /cvsroot/src/sys/arch/xen/x86/xen_intr.c,v
retrieving revision 1.31
diff -u -p -u -r1.31 xen_intr.c
--- xen/x86/xen_intr.c 25 Feb 2023 00:32:13 -0000 1.31
+++ xen/x86/xen_intr.c 13 Jan 2026 14:48:26 -0000
@@ -540,3 +547,9 @@ __strong_alias(cpu_intr_init, xen_cpu_in
__strong_alias(intr_allocate_io_intrsource, xen_intr_allocate_io_intrsource);
__strong_alias(intr_free_io_intrsource, xen_intr_free_io_intrsource);
#endif /* XENPV */
+
+
+#if defined(DIAGNOSTIC) && defined(XENPV)
+const char *cli_panic = "cli with preemtion";
+const char *sti_panic = "sti with interrupts enabled";
+#endif
- References:
- Re: port-xen/58561 (panic: kernel diagnostic assertion, "x86_read_psl() == 0" failed: file, "/home/netbsd/10/src/sys/arch/x86/x86/pmap.c", line 3581)
- Re: port-xen/58561 (panic: kernel diagnostic assertion, "x86_read_psl() == 0" failed: file, "/home/netbsd/10/src/sys/arch/x86/x86/pmap.c", line 3581)
- Re: port-xen/58561 (panic: kernel diagnostic assertion, "x86_read_psl() == 0" failed: file, "/home/netbsd/10/src/sys/arch/x86/x86/pmap.c", line 3581)
- Re: port-xen/58561 (panic: kernel diagnostic assertion, "x86_read_psl() == 0" failed: file, "/home/netbsd/10/src/sys/arch/x86/x86/pmap.c", line 3581)
- Re: port-xen/58561 (panic: kernel diagnostic assertion, "x86_read_psl() == 0" failed: file, "/home/netbsd/10/src/sys/arch/x86/x86/pmap.c", line 3581)
- Re: port-xen/58561 (panic: kernel diagnostic assertion, "x86_read_psl() == 0" failed: file, "/home/netbsd/10/src/sys/arch/x86/x86/pmap.c", line 3581)
- Re: port-xen/58561 (panic: kernel diagnostic assertion, "x86_read_psl() == 0" failed: file, "/home/netbsd/10/src/sys/arch/x86/x86/pmap.c", line 3581)
- Re: port-xen/58561 (panic: kernel diagnostic assertion, "x86_read_psl() == 0" failed: file, "/home/netbsd/10/src/sys/arch/x86/x86/pmap.c", line 3581)
- Prev by Date:
Re: port-xen/58561 (panic: kernel diagnostic assertion, "x86_read_psl() == 0" failed: file, "/home/netbsd/10/src/sys/arch/x86/x86/pmap.c", line 3581)
- Next by Date:
Re: port-xen/58561 (panic: kernel diagnostic assertion, "x86_read_psl() == 0" failed: file, "/home/netbsd/10/src/sys/arch/x86/x86/pmap.c", line 3581)
- Previous by Thread:
Re: port-xen/58561 (panic: kernel diagnostic assertion, "x86_read_psl() == 0" failed: file, "/home/netbsd/10/src/sys/arch/x86/x86/pmap.c", line 3581)
- Next by Thread:
Re: port-xen/58561 (panic: kernel diagnostic assertion, "x86_read_psl() == 0" failed: file, "/home/netbsd/10/src/sys/arch/x86/x86/pmap.c", line 3581)
- Indexes:
Home |
Main Index |
Thread Index |
Old Index