# From: Bodo Stroesser <bstroesser@fujitsu-siemens.com>
# 
# s390 normally doesn't support a method, that allows us to force
# the host to skip its syscall restart handling.
# I implemented a new method in the host, which also is posted to
# LKML to hopefully be inserted in s390 mainline.
# To check availability of this change, I added a new check, which
# is done in a slightly different way for the other arches, too.
# Success in check_ptrace() and success in the new check are
# absolutely necessary for UML to run in any mode.
# So I changed the sequence of checks to:
# 1) check_ptrace() being called at startup very early
# 2) check_ptrace() calls the new check, too
# 3) can_do_skas() is called after check_ptrace()
# check_ptrace() will never return, if it fails, but it now uses
# printf() and exit() instead of panic().
# 
# Signed-off-by: Bodo Stroesser <bstroesser@fujitsu-siemens.com>
Index: linux-2.6.17/arch/um/include/common-offsets.h
===================================================================
--- linux-2.6.17.orig/arch/um/include/common-offsets.h	2007-10-24 10:04:50.000000000 -0400
+++ linux-2.6.17/arch/um/include/common-offsets.h	2007-11-19 11:14:27.000000000 -0500
@@ -10,6 +10,8 @@ DEFINE(UM_KERN_PAGE_MASK, PAGE_MASK);
 DEFINE(UM_KERN_PAGE_SHIFT, PAGE_SHIFT);
 DEFINE(UM_NSEC_PER_SEC, NSEC_PER_SEC);
 
+DEFINE(UM_ERESTARTNOINTR, ERESTARTNOINTR);
+
 DEFINE_STR(UM_KERN_EMERG, KERN_EMERG);
 DEFINE_STR(UM_KERN_ALERT, KERN_ALERT);
 DEFINE_STR(UM_KERN_CRIT, KERN_CRIT);
Index: linux-2.6.17/arch/um/include/sysdep-i386/ptrace_user.h
===================================================================
--- linux-2.6.17.orig/arch/um/include/sysdep-i386/ptrace_user.h	2007-10-17 12:11:52.000000000 -0400
+++ linux-2.6.17/arch/um/include/sysdep-i386/ptrace_user.h	2007-11-19 11:14:27.000000000 -0500
@@ -15,6 +15,8 @@
 #define PT_SYSCALL_NR(regs) ((regs)[ORIG_EAX])
 #define PT_SYSCALL_NR_OFFSET PT_OFFSET(ORIG_EAX)
 
+#define PT_SYSCALL_NR_SKIP_RESTART __NR_getpid
+
 #define PT_SYSCALL_ARG1_OFFSET PT_OFFSET(EBX)
 #define PT_SYSCALL_ARG2_OFFSET PT_OFFSET(ECX)
 #define PT_SYSCALL_ARG3_OFFSET PT_OFFSET(EDX)
Index: linux-2.6.17/arch/um/include/sysdep-x86_64/ptrace_user.h
===================================================================
--- linux-2.6.17.orig/arch/um/include/sysdep-x86_64/ptrace_user.h	2007-10-17 12:11:52.000000000 -0400
+++ linux-2.6.17/arch/um/include/sysdep-x86_64/ptrace_user.h	2007-11-19 11:14:27.000000000 -0500
@@ -18,6 +18,8 @@
 #define PT_SYSCALL_NR(regs) ((regs)[PT_INDEX(ORIG_RAX)])
 #define PT_SYSCALL_NR_OFFSET (ORIG_RAX)
 
+#define PT_SYSCALL_NR_SKIP_RESTART __NR_getpid
+
 #define PT_SYSCALL_ARG1(regs) (((unsigned long *) (regs))[PT_INDEX(RDI)])
 #define PT_SYSCALL_ARG1_OFFSET (RDI)
 
Index: linux-2.6.17/arch/um/os-Linux/start_up.c
===================================================================
--- linux-2.6.17.orig/arch/um/os-Linux/start_up.c	2007-11-19 10:56:39.000000000 -0500
+++ linux-2.6.17/arch/um/os-Linux/start_up.c	2007-11-19 11:18:15.000000000 -0500
@@ -25,6 +25,18 @@
 #include "registers.h"
 #include "skas_ptrace.h"
 
+/* In some cases, host's headers don't provide ERESTARTNOINTR.
+ * If so, we use the definition from our own kernel headers.
+ * As kernel headers must not be included in this user-obj, we
+ * provide kernel's ERESTARTNOINTR as UM_ERESTARTNOINTR in
+ * kern_constants.h.
+ * Here we trust in ERESTARTNOINTR being a part of the ABI for
+ * strace and debuggers, so it shouldn't change.
+ */
+#ifndef ERESTARTNOINTR
+#define ERESTARTNOINTR UM_ERESTARTNOINTR
+#endif
+
 static int ptrace_child(void)
 {
 	int ret;
@@ -59,6 +71,12 @@ static int ptrace_child(void)
 		 * SKAS3/2.6 patch before release -V6, together with a bug in
 		 * the UML code itself.
 		 */
+
+		 /*
+		  * In check_skas_restart_skip, this is the expected
+		  * case, if everything works fine. (see below for
+		  * additional info)
+		  */
 		ret = 2;
 	_exit(ret);
 }
@@ -112,7 +130,7 @@ static int start_ptraced_child(void)
 }
 
 /* When testing for SYSEMU support, if it is one of the broken versions, we
- * must just avoid using sysemu, not panic, but only if SYSEMU features are
+ * must just avoid using sysemu, not exit, but only if SYSEMU features are
  * broken.
  * So only for SYSEMU features we test mustpanic, while normal host features
  * must work anyway!
@@ -344,6 +362,9 @@ void __init os_early_checks(void)
 	/* Print out the core dump limits early */
 	check_coredump_limit();
 
+	/* Check that ptrace on the host meets UML's minimum needs.
+	 * check_ptrace() won't return if ptrace isn't sufficient.
+	 */
 	check_ptrace();
 
 	/* Need to check this early because mmapping happens before the
@@ -388,6 +409,124 @@ __uml_setup("noptraceldt", noptraceldt_c
 "    To support PTRACE_LDT, the host needs to be patched using\n"
 "    the current skas3 patch.\n\n");
 
+
+/*
+ * check_skas_restart_skip() will check, if the host can be forced to
+ * not do syscall restarting, even if the result of a UML-syscall is
+ * -ERESTARTxxxxx. Normally the -ERESTARTxxxxx result isn't passed to
+ * the user, because UML will replace it by -EINTR or the syscall-#
+ * to restart the syscall. So, the host won't see -ERESTARTxxxx in this
+ * case.
+ * But if the syscall in UML is sys_(rt)sigreturn, *every* result may
+ * be passed to the user, because it isn't a real result, but the
+ * interrupted processes register contents. So the host might see
+ * -ERESTARTxxxxx and could do syscall-restarting. This would produce
+ * unpredictable errors in UML.
+ * Thus, we have to force the host to skip syscall-restarting.
+ * Normally, this can be done by setting the syscall-# to -1 on exit
+ * from syscall (e.g. i386, x86_64). But that wouldn't work for s390,
+ * because s390 has syscall-result and syscall-# in the same register.
+ * s390 instead has a "trap" value in host's pt_regs, to distinguish
+ * between syscall / non-syscall. Unfortunately, "trap" isn't readable
+ * or writeable via ptrace.
+ * So, I modified the s390-host in a way, that it will change "trap",
+ * if the syscall-# is written to -1 on syscall-entry.
+ * check_skas_restart_skip() will check presence of the change in
+ * s390-host. Also, it checks the "normal" way of skipping for the
+ * other arches.
+ * If syscall restarting can't be skipped, there is no safe way to run
+ * skas, no matter if skas0 or skas3.
+ * Due to the different syscall-handling in tt, a good result from
+ * check_ptrace is enough to run tt.
+ *
+ * What the check does:
+ * When the ptraced child is stopped at syscall entry, the syscall-# is
+ * overwritten by PT_SYSCALL_NR_SKIP_RESTART, which normally is
+ * __NR_getpid, so nothing is changed. But on s390 -1 is written, which
+ * should change "trap" in the host.
+ * Then the syscall is resumed with PTRACE_SYSCALL. On syscall exit,
+ * the result is replaced by -ERESTARTNOINTR, and if syscall-# and result
+ * are in different registers, the syscall-# is written with -1 (not on
+ * s390).
+ * Then, we send a SIGUSR1 to the child, forcing the host to do signal
+ * processing, which normally would cause syscall restarting. We
+ * intercept and suppress the signal via ptrace.
+ * Now, the ptraced child is resumed and the result is checked
+ * (stop_ptraced_child). What we expect is, that -ERESTARTNOINTR is the
+ * result of the syscall. If the host would do restarting, getpid()
+ * would be done again, and the result would be the child's pid.
+ * So the expected result of the child must be 2.
+ */
+static inline int check_skas_restart_skip(void)
+{
+	int pid, syscall, n, status;
+
+	non_fatal("Checking if syscall restart handling in host can be "
+		  "skipped...");
+	pid = start_ptraced_child();
+	/* Run the child up to the syscall-entry stop of its getpid() call. */
+	while(1){
+		if(ptrace(PTRACE_SYSCALL, pid, 0, 0) < 0)
+			fatal_perror("check_skas_restart_skip : ptrace failed");
+
+		CATCH_EINTR(n = waitpid(pid, &status, WUNTRACED));
+		if(n < 0)
+			fatal_perror("check_skas_restart_skip : wait failed");
+		if(!WIFSTOPPED(status) || (WSTOPSIG(status) != SIGTRAP))
+			fatal("check_skas_restart_skip : expected SIGTRAP, "
+			      "got status = %d", status);
+
+		syscall = ptrace(PTRACE_PEEKUSR, pid, PT_SYSCALL_NR_OFFSET, 0);
+		if(syscall == __NR_getpid){
+			n = ptrace(PTRACE_POKEUSR, pid, PT_SYSCALL_NR_OFFSET,
+				   PT_SYSCALL_NR_SKIP_RESTART);
+			if(n < 0)
+				fatal_perror("check_skas_restart_skip : failed "
+					     "to modify system call");
+			break;
+		}
+	}
+	/* Resume; the next SIGTRAP stop is expected at syscall exit. */
+	if(ptrace(PTRACE_SYSCALL, pid, 0, 0) < 0)
+		fatal_perror("check_skas_restart_skip : ptrace failed");
+
+	CATCH_EINTR(n = waitpid(pid, &status, WUNTRACED));
+	if(n < 0)
+		fatal_perror("check_skas_restart_skip : wait failed");
+	if(!WIFSTOPPED(status) || (WSTOPSIG(status) != SIGTRAP))
+		fatal("check_skas_restart_skip : expected SIGTRAP, "
+		      "got status = %d", status);
+	/* Fake a result that would normally trigger a restart in the host. */
+	n = ptrace(PTRACE_POKEUSR, pid, PT_SYSCALL_RET_OFFSET, -ERESTARTNOINTR);
+	if(n < 0)
+		fatal_perror("check_skas_restart_skip : failed to modify "
+			     "system call result");
+
+	if (PT_SYSCALL_NR_OFFSET != PT_SYSCALL_RET_OFFSET){
+		n = ptrace(PTRACE_POKEUSR, pid, PT_SYSCALL_NR_OFFSET, -1);
+		if(n < 0)
+			fatal_perror("check_skas_restart_skip : failed to "
+				     "modify system call number");
+	}
+	/* A pending signal makes the host run its signal/restart handling. */
+	kill(pid, SIGUSR1);
+
+	if(ptrace(PTRACE_SYSCALL, pid, 0, 0) < 0)
+		fatal_perror("check_skas_restart_skip : ptrace failed");
+
+	CATCH_EINTR(n = waitpid(pid, &status, WUNTRACED));
+	if(n < 0)
+		fatal_perror("check_skas_restart_skip : wait failed");
+	if(!WIFSTOPPED(status) || (WSTOPSIG(status) != SIGUSR1))
+		fatal("check_skas_restart_skip : expected SIGUSR1, "
+		      "got status = %d", status);
+	/* Exit status 2 is expected if getpid() has not been restarted. */
+	n = stop_ptraced_child(pid, 2, 0);
+	non_fatal("%s\n", n ? "failed" : "OK");
+
+	return n;
+}
+
 static inline void check_skas3_ptrace_faultinfo(void)
 {
 	struct ptrace_faultinfo fi;
@@ -468,6 +607,9 @@ static inline void check_skas3_proc_mm(v
 
 int can_do_skas(void)
 {
+	if(check_skas_restart_skip())
+		return 0;
+
 	non_fatal("Checking for the skas3 patch in the host:\n");
 
 	check_skas3_proc_mm();
Index: linux-2.6.17/arch/um/os-Linux/skas/process.c
===================================================================
--- linux-2.6.17.orig/arch/um/os-Linux/skas/process.c	2007-11-19 11:12:54.000000000 -0500
+++ linux-2.6.17/arch/um/os-Linux/skas/process.c	2007-11-19 11:18:49.000000000 -0500
@@ -143,7 +143,7 @@ static void handle_trap(int pid, struct 
 	if (!local_using_sysemu)
 	{
 		err = ptrace(PTRACE_POKEUSR, pid, PT_SYSCALL_NR_OFFSET,
-			     __NR_getpid);
+			     PT_SYSCALL_NR_SKIP_RESTART);
 		if (err < 0)
 			panic("handle_trap - nullifying syscall failed, "
 			      "errno = %d\n", errno);