[PATCH 2/4] unshare: new applet

Fri Mar 11 13:07:53 UTC 2016

Add a fully featured unshare applet that supports all the options accepted
by the upstream (util-linux) version.

Signed-off-by: Bartosz Golaszewski <bartekgola at gmail.com>
---
 util-linux/unshare.c | 450 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 450 insertions(+)
 create mode 100644 util-linux/unshare.c

diff --git a/util-linux/unshare.c b/util-linux/unshare.c
new file mode 100644
index 0000000..41260d8
--- /dev/null
+++ b/util-linux/unshare.c
@@ -0,0 +1,450 @@
+/* vi: set sw=4 ts=4: */
+/*
+ * Mini unshare implementation for busybox.
+ *
+ * Copyright (C) 2016 by Bartosz Golaszewski <bartekgola at gmail.com>
+ *
+ * Licensed under GPLv2 or later, see file LICENSE in this source tree.
+ */
+
+//config:config UNSHARE
+//config:	bool "unshare"
+//config:	default y
+//config:	select PLATFORM_LINUX
+//config:	help
+//config:	  Run program with some namespaces unshared from parent.
+//config:
+//config:config FEATURE_UNSHARE_LONG_OPTS
+//config:	bool "enable long options"
+//config:	default y
+//config:	depends on UNSHARE && LONG_OPTS
+//config:	help
+//config:	  Support long options for the unshare applet. This makes
+//config:	  the busybox implementation more compatible with upstream.
+
+//applet:IF_UNSHARE(APPLET(unshare, BB_DIR_USR_BIN, BB_SUID_DROP))
+
+//kbuild:lib-$(CONFIG_UNSHARE) += unshare.o
+
+//usage:#define unshare_trivial_usage
+//usage:       "[options] <program> [args...]"
+//usage:#if ENABLE_FEATURE_UNSHARE_LONG_OPTS
+//usage:#define unshare_full_usage "\n\n"
+//usage:       "Options:"
+//usage:     "\n	-m, --mount[=<file>]		unshare mounts namespace"
+//usage:     "\n	-u, --uts[=<file>]		unshare UTS namespace (hostname etc.)"
+//usage:     "\n	-i, --ipc[=<file>]		unshare System V IPC namespace"
+//usage:     "\n	-n, --network[=<file>]		unshare network namespace"
+//usage:     "\n	-p, --pid[=<file>]		unshare pid namespace"
+//usage:     "\n	-U, --user[=<file>]		unshare user namespace"
+//usage:     "\n	-f, --fork			fork before launching <program>"
+//usage:     "\n	-M, --mount-proc[=<dir>]	mount proc filesystem first (implies --mount)"
+//usage:     "\n	-r, --map-root-user		map current user to root (implies --user)"
+//usage:     "\n	-P, --propagation slave|shared|private|unchanged"
+//usage:     "\n					modify mount propagation in mount namespace"
+//usage:     "\n	-s, --setgroups allow|deny	control the setgroups syscall in user namespaces"
+//usage:#else
+//usage:#define unshare_full_usage "\n\n"
+//usage:       "Options:"
+//usage:     "\n	-m [<file>]	unshare mounts namespace"
+//usage:     "\n	-u [<file>]	unshare UTS namespace (hostname etc.)"
+//usage:     "\n	-i [<file>]	unshare System V IPC namespace"
+//usage:     "\n	-n [<file>]	unshare network namespace"
+//usage:     "\n	-p [<file>]	unshare pid namespace"
+//usage:     "\n	-U [<file>]	unshare user namespace"
+//usage:     "\n	-f		fork before launching <program>"
+//usage:     "\n	-M [<dir>]	mount proc filesystem first (implies -m)"
+//usage:     "\n	-r		map current user to root (implies -U)"
+//usage:     "\n	-P slave|shared|private|unchanged"
+//usage:     "\n			modify mount propagation in mount namespace"
+//usage:     "\n	-s allow|deny	control the setgroups syscall in user namespaces"
+//usage:#endif
+
+#include "libbb.h"
+
+#include <sched.h>
+#include <sys/types.h>
+#include <sys/mount.h>
+
+/*
+ * Longest possible path to a procfs file used in unshare. Must be able to
+ * contain the '/proc/' string, the '/ns/user' string which is the longest
+ * namespace name and a 32-bit integer representing the process ID.
+ */
+#define PROC_PATH_MAX   (sizeof("/proc//ns/user") + sizeof(pid_t) * 3)
+
+#define PATH_PROC_SETGROUPS	"/proc/self/setgroups"
+#define PATH_PROC_UIDMAP	"/proc/self/uid_map"
+#define PATH_PROC_GIDMAP	"/proc/self/gid_map"
+
+enum {	/* option bits tested on the getopt32() result, in opt_str order */
+	OPT_mount	= BIT( 0),	/* -m */
+	OPT_uts		= BIT( 1),	/* -u */
+	OPT_ipc		= BIT( 2),	/* -i */
+	OPT_network	= BIT( 3),	/* -n */
+	OPT_pid		= BIT( 4),	/* -p */
+	OPT_user	= BIT( 5),	/* -U */
+	OPT_fork	= BIT( 6),	/* -f */
+	OPT_mount_proc	= BIT( 7),	/* -M */
+	OPT_map_root	= BIT( 8),	/* -r */
+	OPT_propagation	= BIT( 9),	/* -P */
+	OPT_setgroups	= BIT(10),	/* -s */
+};
+
+struct namespace {		/* one namespace kind that can be unshared */
+	const int opt;		/* OPT_* bit that requests it */
+	const int flag;		/* CLONE_NEW* flag OR-ed into the unshare() mask */
+	const char *nsfile;	/* file name below /proc/<pid>/ns/ */
+	char *path;		/* bind mount target from the option argument, if any */
+};
+
+struct propagation_mode {	/* one --propagation keyword and its flags */
+	const char *name;	/* keyword compared against the option argument */
+	unsigned long flags;	/* MS_* mount flags; 0 for "unchanged" */
+};
+
+/*
+ * Upstream unshare doesn't support short options for --mount-proc,
+ * --propagation and --setgroups, but let's add them here to let the user
+ * use them even with long options disabled in busybox config.
+ */
+static const char opt_str[] = "+m::u::i::n::p::U::fM::rP:s:";
+
+/*
+ * Upstream unshare only accepts optional arguments (namespace mountpoints)
+ * for long options. We support them for both short (for size reduction
+ * with LONG_OPTS disabled) and long opts (for upstream compatibility).
+ */
+#if ENABLE_FEATURE_UNSHARE_LONG_OPTS
+static const char unshare_longopts[] ALIGN1 =	/* "name\0" arg-kind "short" */
+	"mount\0"		Optional_argument	"m"
+	"uts\0"			Optional_argument	"u"
+	"ipc\0"			Optional_argument	"i"
+	"network\0"		Optional_argument	"n"
+	"pid\0"			Optional_argument	"p"
+	"user\0"		Optional_argument	"U"
+	"fork\0"		No_argument		"f"
+	"mount-proc\0"		Optional_argument	"M"
+	"map-root-user\0"	No_argument		"r"
+	"propagation\0"		Required_argument	"P"
+	"setgroups\0"		Required_argument	"s";
+#endif
+
+/*
+ * Translate a --propagation keyword into the MS_* flags later handed to
+ * mount(). "unchanged" maps to 0; any unknown keyword is reported through
+ * bb_error_msg_and_die().
+ */
+static unsigned long parse_propagation(const char *prop_str)
+{
+	/* Keyword table; the lookup below is a plain linear search. */
+	static const struct propagation_mode known_modes[] = {
+		{ .name = "slave",     .flags = MS_REC | MS_SLAVE   },
+		{ .name = "private",   .flags = MS_REC | MS_PRIVATE },
+		{ .name = "shared",    .flags = MS_REC | MS_SHARED  },
+		{ .name = "unchanged", .flags = 0                   },
+	};
+	const struct propagation_mode *mode = known_modes;
+	const struct propagation_mode *const end =
+		known_modes + ARRAY_SIZE(known_modes);
+
+	while (mode != end) {
+		if (strcmp(mode->name, prop_str) == 0) {
+			/* Exact, case-sensitive match. */
+			return mode->flags;
+		}
+
+		mode++;
+	}
+
+	/* No keyword matched. */
+	bb_error_msg_and_die("unsupported propagation mode: %s", prop_str);
+}
+
+/* Stat /proc/<pid>/ns/mnt through xstat() and return its inode number. */
+static ino_t get_mnt_ns_inode_by_pid(pid_t pid)
+{
+	struct stat st;
+	char ns_path[PROC_PATH_MAX];
+
+	snprintf(ns_path, sizeof(ns_path), "/proc/%d/ns/mnt", pid);
+	xstat(ns_path, &st);
+	return st.st_ino;
+}
+
+/*
+ * Bind mount /proc/<pid>/ns/<nsfile> on ns->path for every ns_list entry
+ * that has a path set; the first mount() failure is fatal.
+ */
+static void mount_namespaces(pid_t pid,
+			     struct namespace *ns_list, size_t num_ns)
+{
+	struct namespace *ns, *end = ns_list + num_ns;
+	char nsf[PROC_PATH_MAX];
+
+	for (ns = ns_list; ns < end; ns++) {
+		/* No path: this namespace was not asked to be persisted. */
+		if (ns->path == NULL)
+			continue;
+
+		snprintf(nsf, sizeof(nsf), "/proc/%d/ns/%s", pid, ns->nsfile);
+
+		if (mount(nsf, ns->path, NULL, MS_BIND, NULL) < 0)
+			bb_perror_msg_and_die("mount %s on %s failed",
+					      nsf, ns->path);
+	}
+}
+
+/*
+ * Mount procfs on target: first mark the mount point recursively private,
+ * then mount "proc" there with nosuid, noexec and nodev. A failure of
+ * either mount() call is reported with the same message.
+ */
+static void mount_procfs(const char *target)
+{
+	const unsigned long proc_flags = MS_NOSUID | MS_NOEXEC | MS_NODEV;
+	int failed;
+
+	/* || short-circuits: proc is only mounted if the first call worked. */
+	failed = mount("none", target, NULL, MS_PRIVATE | MS_REC, NULL) < 0
+	      || mount("proc", target, "proc", proc_flags, NULL) < 0;
+	if (!failed)
+		return;
+
+	bb_perror_msg_and_die("mount %s failed", target);
+}
+
+/*
+ * Applet entry point: unshare the requested namespaces (optionally bind
+ * mounting them on the given files), apply the fork, uid/gid map, mount
+ * propagation and procfs options, then exec <program> or run $SHELL.
+ */
+int unshare_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE;
+int unshare_main(int argc UNUSED_PARAM, char **argv)
+{
+	static struct namespace ns_list[] = {
+		{
+#define NS_MNT_POS 0
+			.opt = OPT_mount,
+			.flag = CLONE_NEWNS,
+			.nsfile = "mnt",
+		},
+		{
+#define NS_UTS_POS 1
+			.opt = OPT_uts,
+			.flag = CLONE_NEWUTS,
+			.nsfile = "uts",
+		},
+		{
+#define NS_IPC_POS 2
+			.opt = OPT_ipc,
+			.flag = CLONE_NEWIPC,
+			.nsfile = "ipc",
+		},
+		{
+#define NS_NET_POS 3
+			.opt = OPT_network,
+			.flag = CLONE_NEWNET,
+			.nsfile = "net",
+		},
+		{
+#define NS_PID_POS 4
+			.opt = OPT_pid,
+			.flag = CLONE_NEWPID,
+			.nsfile = "pid",
+		},
+		{
+#define NS_USR_POS 5
+			.opt = OPT_user,
+			.flag = CLONE_NEWUSER,
+			.nsfile = "user",
+		},
+	};
+
+	int unsflags = 0, i, need_mount = 0, status, setgrp_allow = 0;
+	const char *proc_mnt_target = "/proc", *prop_str, *setgrp_str;
+	unsigned long prop_flags = MS_REC | MS_PRIVATE;
+	uid_t reuid = geteuid();
+	gid_t regid = getegid();
+	unsigned int opts;
+	pid_t pid = -1;
+
+	IF_FEATURE_UNSHARE_LONG_OPTS(applet_long_options = unshare_longopts);
+
+	opts = getopt32(argv, opt_str,
+			&ns_list[NS_MNT_POS].path, &ns_list[NS_UTS_POS].path,
+			&ns_list[NS_IPC_POS].path, &ns_list[NS_NET_POS].path,
+			&ns_list[NS_PID_POS].path, &ns_list[NS_USR_POS].path,
+			&proc_mnt_target, &prop_str, &setgrp_str);
+	argv += optind;
+
+	/*
+	 * Mounting the proc filesystem before running the program implies
+	 * creating a new mount namespace since the /proc mount would
+	 * otherwise mess up existing programs on the system.
+	 */
+	if (opts & OPT_mount_proc)
+		opts |= OPT_mount;
+
+	/* Mapping user and group IDs to root implies --user. */
+	if (opts & OPT_map_root)
+		opts |= OPT_user;
+
+	if (opts & OPT_setgroups) {
+		if (strcmp(setgrp_str, "allow") == 0)
+			setgrp_allow = 1;
+		else if (strcmp(setgrp_str, "deny") != 0)
+			bb_error_msg_and_die(
+				"unsupported --setgroups argument '%s'",
+				setgrp_str);
+	}
+
+	for (i = 0; i < ARRAY_SIZE(ns_list); i++) {
+		struct namespace *ns = &ns_list[i];
+
+		if (opts & ns->opt)
+			unsflags |= ns->flag;
+
+		if (ns->path)
+			need_mount = 1;
+	}
+
+	/* Silently ignore --propagation if --mount is not requested. */
+	if ((opts & OPT_propagation) && (opts & OPT_mount))
+		prop_flags = parse_propagation(prop_str);
+
+	/*
+	 * Special case: if we were requested to unshare the mount namespace
+	 * AND to make any namespace persistent (by bind mounting it) we need
+	 * to spawn a child process which will wait for the parent to call
+	 * unshare(), then mount parent's namespaces while still in the
+	 * previous namespace.
+	 */
+	if (need_mount && (opts & OPT_mount)) {
+		ino_t inop, inoc;
+		pid_t ppid;
+
+		/*
+		 * Can't use getppid() in child, as we can be unsharing the
+		 * pid namespace.
+		 */
+		ppid = getpid();
+
+		/*
+		 * Save current process' mount namespace file inode number. We
+		 * will later use it in child process to check if it already
+		 * changed meaning that this process already called unshare().
+		 */
+		inop = get_mnt_ns_inode_by_pid(ppid);
+
+		pid = xfork();
+		if (pid == 0) {
+			/*
+			 * Child - wait until parent calls unshare(). No issue
+			 * in busy-waiting - by the time we get here from
+			 * fork(), the parent has usually already unshared the
+			 * mount namespace. We should spin a few times at most.
+			 *
+			 * XXX Should probably use a pipe to notify the child
+			 * about completing unshare().
+			 */
+			do {
+				inoc = get_mnt_ns_inode_by_pid(ppid);
+			} while (inoc == inop);
+
+			/* Mount parent's unshared namespaces. */
+			mount_namespaces(ppid, ns_list, ARRAY_SIZE(ns_list));
+
+			return EXIT_SUCCESS;
+		} /* Parent continues. */
+	}
+
+	status = unshare(unsflags);
+	if (status < 0)
+		bb_perror_msg_and_die("unshare failed");
+
+	if (need_mount) {
+		/* Wait for the child to finish mounting the namespaces. */
+		if (opts & OPT_mount) {
+			int exit_status;
+
+			status = safe_waitpid(pid, &exit_status, 0);
+			if (status < 0)
+				bb_perror_msg_and_die("waitpid");
+
+			if (WIFEXITED(exit_status) &&
+			    WEXITSTATUS(exit_status) != EXIT_SUCCESS)
+				return WEXITSTATUS(exit_status);
+		} else {
+			/*
+			 * Regular way - we were requested to mount some other
+			 * namespaces: mount them after the call to unshare().
+			 */
+			mount_namespaces(getpid(), ns_list,
+					 ARRAY_SIZE(ns_list));
+		}
+	}
+
+	/*
+	 * When we're unsharing the pid namespace, it's not the process that
+	 * calls unshare() that is put into the new namespace, but its first
+	 * child. The user may want to use this option to spawn a new process
+	 * that'll become PID 1 in this new namespace.
+	 */
+	if (opts & OPT_fork) {
+		int exit_status;
+
+		pid = xfork();
+		if (pid > 0) {
+			status = safe_waitpid(pid, &exit_status, 0);
+			if (status < 0)
+				bb_perror_msg_and_die("waitpid");
+
+			if (WIFEXITED(exit_status))
+				return WEXITSTATUS(exit_status);
+			else if (WIFSIGNALED(exit_status))
+				kill(getpid(), WTERMSIG(exit_status));
+
+			bb_error_msg_and_die("child exit failed");
+		} /* Child continues. */
+	}
+
+	if (opts & OPT_map_root) {
+		char uidmap_buf[sizeof("0 %u 1") + sizeof(unsigned int) * 3];
+
+		if ((opts & OPT_setgroups) && setgrp_allow)
+			bb_error_msg_and_die(
+				"options --setgroups=allow and --map-root-user are mutually exclusive");
+
+		/*
+		 * Since Linux 3.19 an unprivileged process may only write
+		 * gid_map after writing "deny" to setgroups. Map lines are
+		 * "<ID inside ns> <ID outside ns> <count>": make the caller's
+		 * effective IDs appear as 0 inside the new user namespace.
+		 */
+		xopen_xwrite_close(PATH_PROC_SETGROUPS, "deny");
+		snprintf(uidmap_buf, sizeof(uidmap_buf), "0 %u 1", reuid);
+		xopen_xwrite_close(PATH_PROC_UIDMAP, uidmap_buf);
+		snprintf(uidmap_buf, sizeof(uidmap_buf), "0 %u 1", regid);
+		xopen_xwrite_close(PATH_PROC_GIDMAP, uidmap_buf);
+	} else if (opts & OPT_setgroups) {
+		xopen_xwrite_close(PATH_PROC_SETGROUPS, setgrp_str);
+	}
+
+	/* "unchanged" parses to 0 flags: leave the propagation alone then. */
+	if ((opts & OPT_mount) && prop_flags != 0) {
+		if (mount("none", "/", NULL, prop_flags, NULL) < 0)
+			bb_perror_msg_and_die(
+				"cannot change root filesystem propagation");
+	}
+
+	if (opts & OPT_mount_proc)
+		mount_procfs(proc_mnt_target);
+
+	if (*argv) {
+		execvp(*argv, argv);
+		bb_perror_msg_and_die("failed to execute %s", *argv);
+	}
+
+	run_shell(getenv("SHELL"), 0, NULL, NULL);
+}
-- 
2.1.4