NetBSD-Bugs archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

Re: port-xen/58561 (panic: kernel diagnostic assertion, "x86_read_psl() == 0" failed: file, "/home/netbsd/10/src/sys/arch/x86/x86/pmap.c", line 3581)



On Tue, Jan 13, 2026 at 03:23:14PM +0100, Manuel Bouyer wrote:
> On Mon, Jan 12, 2026 at 11:20:34PM +0100, Manuel Bouyer wrote:
> > On Sun, Jan 11, 2026 at 08:35:10PM -0800, Konrad Schroder wrote:
> > > On 1/10/2026 2:33 PM, Manuel Bouyer wrote:
> > > > And this assembly work may have allowed me to find the problem.
> > > > Basically, the CLI and STI macros are not atomic on Xen PV, and if preemption
> > > > happens at a bad time we may end up updating the upcall_mask of our previous
> > > > CPU. The attached patch should close this race condition; hopefully it's
> > > > the last one.
> > > 
> > > I can confirm that, at least in my case, this allows me to build
> > > continuously all day without a crash.
> > 
> > Good, it points that we're on the right track.
> > Can you try the attached patch instead ? It tries to limit the extra
> > instructions to only places where it's needed, and add DIAGNOSTIC code
> > to check it's really not needed in other places.
> 
> Sorry this patch has a bug; working on an updated version
> stay tuned.

Here's the updated patch

-- 
Manuel Bouyer <bouyer%antioche.eu.org@localhost>
     NetBSD: 26 ans d'experience feront toujours la difference
--
Index: amd64/amd64/amd64_trap.S
===================================================================
RCS file: /cvsroot/src/sys/arch/amd64/amd64/amd64_trap.S,v
retrieving revision 1.55
diff -u -p -u -r1.55 amd64_trap.S
--- amd64/amd64/amd64_trap.S	27 Feb 2023 16:24:28 -0000	1.55
+++ amd64/amd64/amd64_trap.S	13 Jan 2026 14:48:25 -0000
@@ -90,7 +90,7 @@
  */
 
 #ifdef	XENPV
-#define	PRE_TRAP	CLI(cx); movq (%rsp),%rcx ; movq 8(%rsp),%r11 ; addq $0x10,%rsp
+#define	PRE_TRAP	CLI2(cx,r11); movq (%rsp),%rcx ; movq 8(%rsp),%r11 ; addq $0x10,%rsp
 #else
 #define	PRE_TRAP
 #endif
@@ -671,7 +671,7 @@ calltrap:
 .Lalltraps_checkast:
 	movq	CPUVAR(CURLWP),%r14
 	/* Check for ASTs on exit to user mode. */
-	CLI(si)
+	CLI2(si, rdi)
 	CHECK_ASTPENDING(%r14)
 	je	3f
 	CLEAR_ASTPENDING(%r14)
Index: amd64/amd64/cpufunc.S
===================================================================
RCS file: /cvsroot/src/sys/arch/amd64/amd64/cpufunc.S,v
retrieving revision 1.70
diff -u -p -u -r1.70 cpufunc.S
--- amd64/amd64/cpufunc.S	6 Sep 2025 02:53:22 -0000	1.70
+++ amd64/amd64/cpufunc.S	13 Jan 2026 14:48:25 -0000
@@ -184,7 +184,7 @@ ENTRY(wbinvd)
 END(wbinvd)
 
 ENTRY(setusergs)
-	CLI(ax)
+	CLI2(ax, rdi)
 	swapgs
 	movw	%di, %gs
 	swapgs
Index: amd64/amd64/locore.S
===================================================================
RCS file: /cvsroot/src/sys/arch/amd64/amd64/locore.S,v
retrieving revision 1.235
diff -u -p -u -r1.235 locore.S
--- amd64/amd64/locore.S	31 Dec 2025 15:33:50 -0000	1.235
+++ amd64/amd64/locore.S	13 Jan 2026 14:48:25 -0000
@@ -1641,7 +1641,7 @@ ENTRY(handle_syscall)
 	 * to ensure we don't take an interrupt with some of the user
 	 * registers loaded.
 	 */
-	CLI(si)
+	CLI2(si, rax)
 	/* Check for ASTs on exit to user mode. */
 	movl	L_MD_ASTPENDING(%r14),%eax
 	orl	CPUVAR(WANT_PMAPLOAD),%eax
@@ -1768,7 +1768,9 @@ IDTVEC(\name)
 	 * But it didn't disable events
 	 */
 	pushq	%rsi
-	CLI(si)
+	pushq	%rdi
+	CLI2(si, rdi)
+	popq	%rdi
 	popq	%rsi
 	addq	$0x10,%rsp	/* gap to match cs:rip */
 	pushq	$2		/* error code */
@@ -1813,7 +1815,9 @@ IDTVEC_END(syscall32)
 IDTVEC(osyscall)
 #ifdef XENPV
 	pushq	%rsi
-	CLI(si)
+	pushq	%rdi
+	CLI2(si, rdi)
+	popq	%rdi
 	popq	%rsi
 	movq (%rsp),%rcx
 	movq 8(%rsp),%r11
Index: amd64/amd64/spl.S
===================================================================
RCS file: /cvsroot/src/sys/arch/amd64/amd64/spl.S,v
retrieving revision 1.49
diff -u -p -u -r1.49 spl.S
--- amd64/amd64/spl.S	1 Mar 2023 08:38:50 -0000	1.49
+++ amd64/amd64/spl.S	13 Jan 2026 14:48:25 -0000
@@ -313,6 +313,7 @@ END(spllower)
  * a lower-prio one first, which needs to take the kernel lock -->
  * the sending CPU will never see the that CPU accept the IPI
  * (see pmap_tlb_shootnow).
+ * XenPV note: it seems that we can be called at spl0, so need to use CLI2()
  */
 IDTVEC(spllower)
 	pushq	%rbx
@@ -323,7 +324,7 @@ IDTVEC(spllower)
 1:
 	movl	%ebx,%eax		/* get cpl */
 	movq	CPUVAR(IUNMASK)(,%rax,8),%rax
-	CLI(si)
+	CLI2(si, r12)
 	andq	CPUVAR(IPENDING),%rax	/* any non-masked bits left? */
 	jz	2f
 	bsrq	%rax,%rax
@@ -355,7 +356,7 @@ IDTVEC(doreti)
 1:
 	movl    %ebx,%eax
 	movq	CPUVAR(IUNMASK)(,%rax,8),%rax
-	CLI(si)
+	CLI2(si, r14)
 	andq	CPUVAR(IPENDING),%rax
 	jz	2f
 	bsrq	%rax,%rax		/* slow, but not worth optimizing */
@@ -378,7 +379,7 @@ IDTVEC(doreti)
 	movq	%rsp,%rdi
 	KMSAN_INIT_ARG(8)
 	call	_C_LABEL(trap)
-	CLI(si)
+	CLI2(si, rdi)
 	jmp	.Ldoreti_checkast
 3:
 	CHECK_DEFERRED_SWITCH
@@ -389,6 +390,6 @@ IDTVEC(doreti)
 9:
 	STI(si)
 	call	_C_LABEL(do_pmap_load)
-	CLI(si)
+	CLI2(si, rdi)
 	jmp	.Ldoreti_checkast		/* recheck ASTs */
 IDTVEC_END(doreti)
Index: amd64/include/frameasm.h
===================================================================
RCS file: /cvsroot/src/sys/arch/amd64/include/frameasm.h,v
retrieving revision 1.55
diff -u -p -u -r1.55 frameasm.h
--- amd64/include/frameasm.h	30 Jul 2022 14:11:00 -0000	1.55
+++ amd64/include/frameasm.h	13 Jan 2026 14:48:25 -0000
@@ -23,31 +23,68 @@
 #define	XEN_ONLY2(x,y)	x,y
 #define	NOT_XEN(x)
 
+#ifdef DIAGNOSTIC
+/*
+ * acessing EVTCHN_UPCALL_MASK is save only if preemption is disabled, i.e.:
+ * l_nopreempt is not 0, or
+ * ci_ilevel is nit 0, or
+ * EVTCHN_UPCALL_MASK is not 0
+ * ci_idepth is not negative
+ */
 #define CLI(temp_reg) \
- 	movq CPUVAR(VCPU),%r ## temp_reg ;			\
+ 	movq CPUVAR(CURLWP),%r ## temp_reg ;			\
+	cmpl $0, L_NOPREEMPT(%r ## temp_reg);			\
+	jne 199f;						\
+	cmpb $0, CPUVAR(ILEVEL);				\
+	jne 199f;						\
+	movl CPUVAR(IDEPTH), %e ## temp_reg;			\
+	test %e ## temp_reg, %e ## temp_reg;			\
+	jns 199f;						\
+ 	movq $panicstr,%r ## temp_reg ;			 	\
+	cmpq $0, 0(%r ## temp_reg);				\
+	jne 199f;						\
+	movq _C_LABEL(cli_panic), %rdi;				\
+	call _C_LABEL(panic);					\
+199:	movq CPUVAR(VCPU),%r ## temp_reg ;			\
 	movb $1,EVTCHN_UPCALL_MASK(%r ## temp_reg);
 
 #define STI(temp_reg) \
  	movq CPUVAR(VCPU),%r ## temp_reg ;			\
-	movb $0,EVTCHN_UPCALL_MASK(%r ## temp_reg);
+	cmpb $0, EVTCHN_UPCALL_MASK(%r ## temp_reg);		\
+	jne 198f;						\
+ 	movq $panicstr,%r ## temp_reg ;				\
+	cmpq $0, 0(%r ## temp_reg);				\
+	jne 198f;						\
+	movq _C_LABEL(sti_panic), %rdi;				\
+	call _C_LABEL(panic);					\
+198:	movb $0,EVTCHN_UPCALL_MASK(%r ## temp_reg);		\
+ 	movq CPUVAR(CURLWP),%r ## temp_reg ;
+#else
+
+#define CLI(temp_reg) \
+ 	movq CPUVAR(VCPU),%r ## temp_reg ;			\
+	movb $1,EVTCHN_UPCALL_MASK(%r ## temp_reg);
 
-#define PUSHF(temp_reg) \
+#define STI(temp_reg) \
  	movq CPUVAR(VCPU),%r ## temp_reg ;			\
-	movzbl EVTCHN_UPCALL_MASK(%r ## temp_reg), %e ## temp_reg; \
-	pushq %r ## temp_reg
+	movb $0,EVTCHN_UPCALL_MASK(%r ## temp_reg);		\
 
-#define POPF \
-	popq %rdi; \
-	call _C_LABEL(xen_write_psl)
+#endif /* DIAGNOSTIC */
 
+/* CLI() with preemtion disabled */
+#define CLI2(temp_reg, temp_reg2) \
+ 	movq CPUVAR(CURLWP),% ## temp_reg2 ;			\
+	incl L_NOPREEMPT(% ## temp_reg2);			\
+ 	movq CPUVAR(VCPU),%r ## temp_reg ;			\
+	movb $1,EVTCHN_UPCALL_MASK(%r ## temp_reg);		\
+	decl L_NOPREEMPT(% ## temp_reg2);
 
 #else /* XENPV */
 #define	XEN_ONLY2(x,y)
 #define	NOT_XEN(x)	x
 #define CLI(temp_reg) cli
+#define CLI2(temp_reg, temp_reg2) cli
 #define STI(temp_reg) sti
-#define PUSHF(temp_reg) pushf
-#define POPL popl
 #endif	/* XENPV */
 
 #define HP_NAME_CLAC		1
Index: xen/x86/xen_intr.c
===================================================================
RCS file: /cvsroot/src/sys/arch/xen/x86/xen_intr.c,v
retrieving revision 1.31
diff -u -p -u -r1.31 xen_intr.c
--- xen/x86/xen_intr.c	25 Feb 2023 00:32:13 -0000	1.31
+++ xen/x86/xen_intr.c	13 Jan 2026 14:48:26 -0000
@@ -540,3 +547,9 @@ __strong_alias(cpu_intr_init, xen_cpu_in
 __strong_alias(intr_allocate_io_intrsource, xen_intr_allocate_io_intrsource);
 __strong_alias(intr_free_io_intrsource, xen_intr_free_io_intrsource);
 #endif /* XENPV */
+
+
+#if defined(DIAGNOSTIC) && defined(XENPV)
+const char *cli_panic = "cli with preemtion";
+const char *sti_panic = "sti with interrupts enabled";
+#endif


Home | Main Index | Thread Index | Old Index