--- /dev/null
+/* Unshare daemonizer.
+ * Written by Mike Frysinger <vapier@gmail.com>
+ * Released into the public domain.
+ */
+
+/* TODO:
+ * - Add userns support.
+ * - Make pidns init optional.
+ * - Make setproctitle nicer and include program argv[0].
+ */
+
+#define _GNU_SOURCE
+
+#include <err.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <getopt.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sysexits.h>
+#include <unistd.h>
+#include <net/if.h>
+#include <sys/ioctl.h>
+#include <sys/mount.h>
+#include <sys/prctl.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+
+#define PROG "vunshare"
+
+/*
+ * Thin wrapper around unshare(2).
+ *
+ * Returns true when unshare(flags) succeeded and false when it failed with
+ * EINVAL.  Any other failure terminates the program through err().
+ */
+static bool vunshare(int flags)
+{
+	if (unshare(flags) != -1)
+		return true;
+
+	/* EINVAL is reported back to the caller; everything else is fatal. */
+	if (errno != EINVAL)
+		err(1, "unshare failed");
+
+	return false;
+}
+
+/*
+ * Enter a new network namespace and bring its loopback interface up.
+ *
+ * Does nothing further when vunshare(CLONE_NEWNET) reports failure.  Any
+ * error while configuring "lo" is fatal (err()).
+ */
+static void unshare_net(void)
+{
+	if (!vunshare(CLONE_NEWNET))
+		return;
+
+	/* Any socket will do; it is only a handle for the interface ioctls. */
+	int sock = socket(AF_LOCAL, SOCK_DGRAM|SOCK_CLOEXEC, 0);
+	if (sock == -1)
+		err(1, "socket(AF_LOCAL) failed");
+
+	/* Start from a fully defined request rather than stack garbage. */
+	struct ifreq ifr;
+	memset(&ifr, 0, sizeof(ifr));
+
+	/* Equiv of `ip link set up lo`.  Kernel will assign 127.0.0.1 for us. */
+	strcpy(ifr.ifr_name, "lo");
+	if (ioctl(sock, SIOCGIFFLAGS, &ifr) < 0)
+		err(1, "ioctl(SIOCGIFFLAGS) failed");
+	strcpy(ifr.ifr_name, "lo");
+	ifr.ifr_flags |= IFF_UP | IFF_RUNNING;
+	if (ioctl(sock, SIOCSIFFLAGS, &ifr) < 0)
+		err(1, "ioctl(SIOCSIFFLAGS) failed");
+
+	/* The helper socket is no longer needed; do not leak it. */
+	close(sock);
+}
+
+/* Original argv as received by main(); its strings are overwritten below. */
+static char **title_argv;
+
+/*
+ * Rename the current process to "<PROG>: <title> [pid ns]".
+ *
+ * The name is set twice: through prctl(PR_SET_NAME) and by overwriting the
+ * argv strings in place.  The new title is truncated to the space occupied
+ * by the original argv strings and the remainder of that space is NUL
+ * filled.
+ *
+ * NOTE(review): this assumes the argv strings are laid out back to back in
+ * memory starting at title_argv[0] -- confirm for the target platform.
+ */
+static void setproctitle(const char *title)
+{
+	/* Hopefully 1k is all we ever need. */
+	char newtitle[1024];
+	memset(newtitle, 0, sizeof(newtitle));
+	/* Bounded formatting: a long title is cut instead of overflowing. */
+	snprintf(newtitle, sizeof(newtitle), "%s: %s [pid ns]", PROG, title);
+
+	prctl(PR_SET_NAME, (uintptr_t)newtitle);
+
+	/* Nothing to clobber if we were started without an argv[0]. */
+	if (title_argv == NULL || title_argv[0] == NULL)
+		return;
+
+	/* Clobber argv to set the title.  Need to figure out how much space though. */
+	size_t space = 0;
+	for (char **arg = title_argv; *arg; ++arg)
+		space += strlen(*arg) + 1;
+
+	/* Only the argv strings are counted; the environment is left alone. */
+
+	/* Truncate so the very last byte of the argv area stays a NUL. */
+	if (strlen(newtitle) >= space)
+		newtitle[space - 1] = '\0';
+	/* strncpy() is used on purpose: it NUL pads the rest of the area. */
+	strncpy(title_argv[0], newtitle, space);
+}
+
+/*
+ * Close file descriptors 3 through 9.  Results of close() are ignored, so
+ * descriptors that are not open are simply skipped.
+ */
+static void close_fds(void)
+{
+	int fd = 3;
+
+	while (fd < 10)
+		close(fd++);
+}
+
+/*
+ * Terminate the calling process so that it mirrors a wait() status.
+ *
+ * If the status says the child was killed by a signal, or exited with a
+ * code above 128, the matching signal is reset to its default action and
+ * sent to ourselves.  Should that not terminate us, we exit with
+ * 128 + signal.  Otherwise we exit with the child's exit code.
+ */
+static void exit_as_status_ext(int status)
+{
+	int code = WEXITSTATUS(status);
+	int signo;
+
+	if (WIFSIGNALED(status))
+		signo = WTERMSIG(status);
+	else if (code > 128)
+		/* For the external init, translate the signal status back.
+		 * TODO: This gets it wrong when the child actually exited.
+		 * We need to set up a pipe between the two inits so we can
+		 * get back the proper details.
+		 */
+		signo = code - 128;
+	else
+		signo = 0;
+
+	/* Plain exit: nothing to re-raise. */
+	if (signo == 0)
+		exit(code);
+
+	signal(signo, SIG_DFL);
+	kill(getpid(), signo);
+
+	/* Still here ?  Maybe the signal was masked.  Just exit. */
+	exit(128 + signo);
+}
+
+/*
+ * Exit with a code derived from a wait() status: 128 + signal number when
+ * the child was killed by a signal, otherwise the child's own exit code.
+ *
+ * If we are the init for the pid ns, we can't kill ourselves -- the kernel
+ * explicitly disallows this -- so the signal is only encoded in the exit
+ * code and our parent is left to interpret it.
+ */
+static void exit_as_status_int(int status)
+{
+	exit(WIFSIGNALED(status) ? 128 + WTERMSIG(status) : WEXITSTATUS(status));
+}
+
+/*
+ * Reap children until wait() reports that none are left.
+ *
+ * Returns the wait status of the last child that was reaped.  If no child
+ * could be reaped at all, the returned status decodes as "exited with
+ * code 1".
+ *
+ * NOTE(review): the status returned is that of whichever child was reaped
+ * last, which is not necessarily the child that was forked directly.
+ */
+static int reap_children(void)
+{
+	/* Linux wait-status layout keeps the exit code in bits 8-15, so this
+	 * is WEXITSTATUS() == 1.  A bare 1 would decode as "killed by
+	 * signal 1" in the WIF*() macros used by the callers.
+	 */
+	int status = 1 << 8;
+	int last;
+
+	for (;;) {
+		if (wait(&last) != -1) {
+			status = last;
+			continue;
+		}
+		/* An interrupted wait() is not the end of the children. */
+		if (errno != EINTR)
+			break;
+	}
+	return status;
+}
+
+/* Pid of the forked child; written by setup_signal_handler(). */
+static pid_t child_pid;
+
+/*
+ * SA_SIGINFO handler that relays a received signal.
+ *
+ * When running as pid 1 (the internal init) the signal is sent to every
+ * process we may signal (kill(-1, ...)), but only if si_pid is 0.  In any
+ * other process (the external init) the signal is sent to child_pid.
+ */
+static void signal_passthru(int sig, siginfo_t *siginfo, void *context)
+{
+	/* kill() may modify errno; keep the interrupted code's value intact. */
+	int saved_errno = errno;
+
+	(void)context;
+
+	if (getpid() == 1) {
+		/* Internal init. */
+
+		/* If the signal is coming from our children, ignore it.
+		 * If it's coming from outside the pid ns, pass it along.
+		 */
+		if (siginfo->si_pid == 0) {
+			/* Kill all the children! */
+			kill(-1, sig);
+		}
+	} else {
+		/* External init. */
+
+		/* Just forward signal to the child. */
+		kill(child_pid, sig);
+	}
+
+	errno = saved_errno;
+}
+
+/* We want to forward some signals to the child process. Block the rest.
+ * We don't actually exit as we wait for the child to die/process the signal
+ * first, and then we'll kill/exit after that point.
+ */
+static void setup_signal_handler(pid_t pid)
+{
+ int i;
+
+ struct sigaction sa = {
+ .sa_sigaction = signal_passthru,
+ .sa_flags = SA_SIGINFO | SA_RESTART,
+ };
+
+ child_pid = pid;
+
+ for (i = 1; i < SIGUNUSED; ++i)
+ if (sigaction(i, &sa, NULL) && errno != EINVAL)
+ fprintf(stderr, "sigaction(%i) failed: %s\n", i, strerror(errno));
+ for (i = SIGRTMIN; i <= SIGRTMAX; ++i)
+ if (sigaction(i, &sa, NULL) && errno != EINVAL)
+ fprintf(stderr, "sigaction(%i) failed: %s\n", i, strerror(errno));
+
+ /* As an init, we will reap the children via wait(). */
+ signal(SIGCHLD, SIG_DFL);
+}
+
+/*
+ * Enter a new pid namespace and set up two supervising "init" processes.
+ *
+ * Returns false if vunshare(CLONE_NEWPID) fails.  Otherwise the function
+ * forks twice and only the innermost child returns (with true); both
+ * parents never return from here:
+ *   - the first parent ("ext init") exits 0 right away when daemonize is
+ *     set, otherwise it forwards signals, reaps its child and exits via
+ *     exit_as_status_ext();
+ *   - the second parent ("int init") forwards signals, reaps children and
+ *     exits via exit_as_status_int().
+ * Between the two forks the process optionally detaches (setsid() plus
+ * stdio redirected to /dev/null) and mounts a new proc instance on /proc.
+ * Any failing system call in that sequence is fatal (err()).
+ */
+static bool unshare_pid(bool daemonize)
+{
+ if (!vunshare(CLONE_NEWPID))
+ return false;
+
+ pid_t pid;
+
+ /* Set up external init process. */
+ pid = fork();
+ switch (pid) {
+ /* err() does not return, so there is no fall through here. */
+ case -1: err(1, "fork() failed");
+ case 0: break;
+ default:
+ /* Parent: when daemonizing nobody is left to supervise from outside. */
+ if (daemonize)
+ exit(0);
+ setproctitle("ext init");
+ setup_signal_handler(pid);
+ close_fds();
+ /* Exits the process; control never leaves this switch arm. */
+ exit_as_status_ext(reap_children());
+ }
+
+ /* Only the child of the first fork gets here. */
+ if (daemonize) {
+ if (setsid() == -1)
+ err(1, "setsid() failed");
+
+ /* Point stdin, stdout and stderr at /dev/null. */
+ int fd = open("/dev/null", O_RDWR);
+ if (fd == -1)
+ err(1, "open(/dev/null) failed");
+ if (dup2(fd, 0) == -1 || dup2(fd, 1) == -1 || dup2(fd, 2) == -1)
+ err(1, "dup2() failed");
+ /* Keep fd if open() happened to return one of 0, 1 or 2. */
+ if (fd > 2)
+ close(fd);
+ }
+
+ /* Set up fresh /proc. */
+ /* First mark /proc private (recursively), then mount a new proc on it. */
+ if (mount("none", "/proc", 0, MS_PRIVATE | MS_REC, ""))
+ err(1, "mount(/proc, MS_PRIVATE) failed");
+ if (mount("proc", "/proc", "proc", MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME, ""))
+ err(1, "mount(/proc) failed");
+
+ /* Set up internal init process. */
+ pid = fork();
+ switch (pid) {
+ case -1: err(1, "fork() failed");
+ case 0: break;
+ default:
+ setproctitle("int init");
+ setup_signal_handler(pid);
+ close_fds();
+ /* Exits the process; control never leaves this switch arm. */
+ exit_as_status_int(reap_children());
+ }
+
+ /* Only the child of the second fork returns to the caller. */
+ return true;
+}
+
+/*
+ * Write the one-line mapping "<inside> <outside> 1" to an id map file.
+ * Failing to open the file is fatal; a failed write is only warned about.
+ */
+static void write_id_map(const char *path, unsigned int inside, unsigned int outside)
+{
+	FILE *fp = fopen(path, "w");
+	if (fp == NULL)
+		err(1, "fopen(%s) failed", path);
+
+	/* stdio buffers the line, so a rejected write may only show at fclose(). */
+	int wrote = fprintf(fp, "%u %u 1\n", inside, outside);
+	int closed = fclose(fp);
+	if (wrote < 0 || closed != 0)
+		warn("writing %s failed", path);
+}
+
+/*
+ * Install single-id uid and gid mappings for the current process.
+ *
+ * iuid/igid are written as the first field and ouid/ogid as the second
+ * field of /proc/self/uid_map and /proc/self/gid_map.  Before that, "deny"
+ * is written to /proc/self/setgroups if that file can be opened.
+ */
+static void map_uid_gid(uid_t iuid, gid_t igid, uid_t ouid, gid_t ogid)
+{
+	FILE *fp;
+
+	/* Best effort: silently skipped when the file cannot be opened. */
+	fp = fopen("/proc/self/setgroups", "w");
+	if (fp) {
+		fputs("deny\n", fp);
+		fclose(fp);
+	}
+
+	write_id_map("/proc/self/uid_map", iuid, ouid);
+	write_id_map("/proc/self/gid_map", igid, ogid);
+}
+
+/* Shorthand for long options that take a mandatory argument. */
+#define a_argument required_argument
+
+/* Long options for getopt_long(); "--pid <arg>" is reported as value 1. */
+static const struct option opts[] = {
+	{ .name = "pid", .has_arg = a_argument, .flag = NULL, .val = 1 },
+	{ .name = NULL,  .has_arg = 0,          .flag = NULL, .val = 0 },
+};
+
+/* Print the one-line usage summary to stdout and exit with EX_USAGE. */
+static void usage(void)
+{
+	fputs("Usage: unshare [options] <program>\n", stdout);
+	exit(EX_USAGE);
+}
+
+/*
+ * Entry point: unshare the requested namespaces, then exec <program>.
+ *
+ * Options:
+ *   -i, -m, -n, -p, -u, -U   request a new ipc / mount / net / pid / uts /
+ *                            user namespace
+ *   -D                       daemonize
+ *   --pid <file>             write our pid to <file> just before exec
+ *
+ * Returns 127 if execvp() fails; never returns otherwise.
+ */
+int main(int argc, char *argv[])
+{
+	int c;
+	FILE *pidfp = NULL;
+	const char *pid = NULL;
+	bool newipc = false;
+	bool newmnt = false;
+	bool newnet = false;
+	bool newpid = false;
+	bool newuts = false;
+	bool newusr = false;
+	bool daemonize = false;
+	uid_t uid = 0;
+	gid_t gid = 0;
+
+	/* Remember the original argv for setproctitle(). */
+	title_argv = argv;
+
+	while ((c = getopt_long(argc, argv, "+DimnpuU", opts, NULL)) != -1) {
+		switch (c) {
+		case 1:
+			pid = optarg;
+			break;
+		case 'i': newipc = true; break;
+		case 'm': newmnt = true; break;
+		case 'n': newnet = true; break;
+		case 'p': newpid = true; break;
+		case 'u': newuts = true; break;
+		case 'U': newusr = true; break;
+		case 'D': daemonize = true; break;
+		default:
+			usage();
+		}
+	}
+	argc -= optind;
+	argv += optind;
+
+	/* A program to run is mandatory. */
+	if (argc == 0)
+		usage();
+
+	if (newusr) {
+		/* Ids must be sampled before entering the new user namespace. */
+		uid = getuid();
+		gid = getgid();
+		if (vunshare(CLONE_NEWUSER))
+			map_uid_gid(0, 0, uid, gid);
+		else
+			newusr = false;
+	}
+
+	/* A new pid namespace also gets a new mount namespace (for /proc). */
+	if (newmnt || newpid)
+		vunshare(CLONE_NEWNS);
+	if (newuts)
+		vunshare(CLONE_NEWUTS);
+	if (newipc)
+		vunshare(CLONE_NEWIPC);
+	if (newnet)
+		unshare_net();
+
+	/* Open the pid file now; it is written once the final pid is known. */
+	if (pid) {
+		pidfp = fopen(pid, "we");
+		if (pidfp == NULL)
+			err(1, "fopen(%s) failed", pid);
+	}
+
+	if (newpid && unshare_pid(daemonize)) {
+		/* Nothing. */
+	} else if (daemonize) {
+		if (daemon(1, 0))
+			err(1, "daemon() failed");
+	}
+
+	if (pid) {
+		/* pid_t is signed, so print it as such; report a lost write. */
+		int wrote = fprintf(pidfp, "%d\n", (int)getpid());
+		int closed = fclose(pidfp);
+		if (wrote < 0 || closed != 0)
+			err(1, "writing %s failed", pid);
+	}
+
+	/* Second user namespace: map the original ids onto id 0 of the first. */
+	if (newusr) {
+		if (vunshare(CLONE_NEWUSER))
+			map_uid_gid(uid, gid, 0, 0);
+	}
+
+	execvp(argv[0], argv);
+	fprintf(stderr, "%s: %s\n", argv[0], strerror(errno));
+	return 127;
+}