/*
 * Copyright (C) 2015 Canonical Ltd
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 3 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 */
#include "config.h"
#include "mount-support.h"

#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <mntent.h>
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mount.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>

#include "classic.h"
#include "cleanup-funcs.h"
#include "mount-support-nvidia.h"
#include "quirks.h"
#include "snap.h"
#include "utils.h"

#define MAX_BUF 1000

/*!
 * The void directory.
 *
 * Snap confine moves to that directory in case it cannot retain the current
 * working directory across the pivot_root call.
 **/
#define SC_VOID_DIR "/var/lib/snapd/void"

/**
 * Get the path to the mounted core snap on the host distribution.
 *
 * The core snap may be named just "core" (preferred) or "ubuntu-core"
 * (legacy).  The mount point dependes on build-time configuration and may
 * differ from distribution to distribution.
 **/
static const char *sc_get_outer_core_mount_point()
{
	const char *core_path = SNAP_MOUNT_DIR "/core/current/";
	const char *ubuntu_core_path = SNAP_MOUNT_DIR "/ubuntu-core/current/";
	static const char *result = NULL;
	if (result == NULL) {
		if (access(core_path, F_OK) == 0) {
			// Use the "core" snap if available.
			result = core_path;
		} else if (access(ubuntu_core_path, F_OK) == 0) {
			// If not try to fall back to the "ubuntu-core" snap.
			result = ubuntu_core_path;
		} else {
			die("cannot locate the core snap");
		}
	}
	return result;
}

/**
 * Replace /tmp with a private, per-invocation directory.
 *
 * @security_tag: security tag of the application; it becomes part of the
 *                name of the directory created under the original /tmp.
 *
 * Steps, in order:
 *  - mkdtemp(3) creates /tmp/snap.<uid>_<security_tag>_XXXXXX,
 *  - a "tmp" directory with mode 1777 is created inside it,
 *  - the inner directory is bind mounted over /tmp and the mount is marked
 *    MS_PRIVATE,
 *  - /tmp is chown(2)-ed to the real uid and gid of the caller,
 *  - TMPDIR and TEMPDIR are set to "/tmp".
 *
 * The working directory is moved to "/" around the mount calls and restored
 * afterwards.  Every failure is fatal (die()).
 **/
static void setup_private_mount(const char *security_tag)
{
	const uid_t real_uid = getuid();
	const gid_t real_gid = getgid();
	char private_tmp[MAX_BUF] = { 0 };

	// The outer directory is the one protected from other users; the
	// world-writable, sticky "tmp" below it is what applications get to see.
	must_snprintf(private_tmp, sizeof(private_tmp),
		      "/tmp/snap.%d_%s_XXXXXX", real_uid, security_tag);
	if (mkdtemp(private_tmp) == NULL) {
		die("unable to create tmpdir");
	}
	// Clear the umask so that mkdir() below applies exactly mode 01777.
	mode_t saved_umask = umask(0);
	// The buffer is both input and output of the next format call, hence
	// the temporary copy of the base directory name.
	char *base_dir = strdup(private_tmp);
	if (base_dir == NULL) {
		die("Out of memory");
	}
	must_snprintf(private_tmp, sizeof(private_tmp), "%s/tmp", base_dir);
	free(base_dir);

	if (mkdir(private_tmp, 01777) != 0) {
		die("unable to create /tmp inside private dir");
	}
	umask(saved_umask);

	// Step out to '/' because the mount will not apply to the current
	// directory; the original location is restored further down.
	char *saved_cwd = get_current_dir_name();
	if (saved_cwd == NULL) {
		die("unable to get current directory");
	}
	if (chdir("/") != 0) {
		die("unable to change directory to '/'");
	}
	// MS_BIND is there from linux 2.4
	if (mount(private_tmp, "/tmp", NULL, MS_BIND, NULL) != 0) {
		die("unable to bind private /tmp");
	}
	// MS_PRIVATE needs linux > 2.6.11
	if (mount("none", "/tmp", NULL, MS_PRIVATE, NULL) != 0) {
		die("unable to make /tmp/ private");
	}
	// Ownership is changed only after the bind mount, to avoid potential
	// shenanigans with the path before it is mounted.
	if (chown("/tmp/", real_uid, real_gid) < 0) {
		die("unable to chown tmpdir");
	}
	if (chdir(saved_cwd) != 0) {
		die("unable to change to original directory");
	}
	free(saved_cwd);

	// Point the usual temporary-directory variables at the new /tmp.
	const char *const tmp_env_names[] = { "TMPDIR", "TEMPDIR" };
	const size_t tmp_env_count =
	    sizeof tmp_env_names / sizeof tmp_env_names[0];
	for (size_t n = 0; n < tmp_env_count; n++) {
		if (setenv(tmp_env_names[n], "/tmp", 1) != 0) {
			die("unable to set '%s'", tmp_env_names[n]);
		}
	}
}

/**
 * Mount a new devpts instance at /dev/pts and bind /dev/pts/ptmx over
 * /dev/ptmx.
 *
 * See https://www.kernel.org/doc/Documentation/filesystems/devpts.txt
 *
 * Ubuntu by default uses devpts 'single-instance' mode where /dev/pts/ptmx is
 * mounted with ptmxmode=0000. The startup scripts are left alone; instead the
 * instructions in point '4' of 'User-space changes' in the document above are
 * followed: after unshare(CLONE_NEWNS) devpts is mounted with
 * -o newinstance,ptmxmode=0666 and /dev/pts/ptmx is bind mounted onto
 * /dev/ptmx.
 *
 * Every failure is fatal (die()).
 **/
static void setup_private_pts()
{
	struct stat probe;

	// Without /dev/pts/ptmx the system is in legacy mode, which does not
	// provide the isolation required here.
	if (stat("/dev/pts/ptmx", &probe) != 0) {
		die("/dev/pts/ptmx does not exist");
	}
	// /dev/ptmx has to exist because it is the target of the bind mount.
	if (stat("/dev/ptmx", &probe) != 0) {
		die("/dev/ptmx does not exist");
	}
	// Multi-instance mode, hence ptmxmode=0666. The remaining options are
	// copied from /etc/default/devpts.
	const char *devpts_options = "newinstance,ptmxmode=0666,mode=0620,gid=5";
	if (mount("devpts", "/dev/pts", "devpts", MS_MGC_VAL, devpts_options)
	    != 0) {
		die("unable to mount a new instance of '/dev/pts'");
	}
	if (mount("/dev/pts/ptmx", "/dev/ptmx", "none", MS_BIND, 0) != 0) {
		die("unable to mount '/dev/pts/ptmx'->'/dev/ptmx'");
	}
}

/**
 * Create SC_HOSTFS_DIR (mode 0755) when it does not exist yet.
 *
 * The hostfs directory should be provided by the snapd packaging, but the
 * release that ships it is not universally available.  Creating it here
 * unblocks features that depend on it.  Failure to create it is fatal.
 **/
static void sc_mkdir_hostfs_if_missing()
{
	if (access(SC_HOSTFS_DIR, F_OK) == 0) {
		// Already present, nothing to do.
		return;
	}
	debug("creating missing hostfs directory");
	if (mkdir(SC_HOSTFS_DIR, 0755) != 0) {
		die("cannot create %s", SC_HOSTFS_DIR);
	}
}

/**
 * Bind mount the host "/" at "$rootfs_dir" followed by SC_HOSTFS_DIR.
 *
 * @rootfs_dir: directory that is about to become the new root file system.
 *
 * The mount is requested with MS_BIND | MS_RDONLY.  A failure with ENOENT is
 * reported as the core snap being too old (the mount point is missing in it);
 * any other failure is reported with the destination path.  Both are fatal.
 *
 * NOTE(review): mount(2) documents that with MS_BIND the remaining flag bits
 * (other than MS_REC) are ignored, so the mount may not actually end up
 * read-only without a follow-up MS_REMOUNT | MS_BIND | MS_RDONLY call --
 * TODO confirm.
 **/
static void sc_bind_mount_hostfs(const char *rootfs_dir)
{
	char hostfs_path[512];

	must_snprintf(hostfs_path, sizeof hostfs_path, "%s%s", rootfs_dir,
		      SC_HOSTFS_DIR);
	debug("bind-mounting host filesystem at %s", hostfs_path);
	if (mount("/", hostfs_path, NULL, MS_BIND | MS_RDONLY, NULL) == 0) {
		return;
	}
	switch (errno) {
	case ENOENT:
		die("cannot bind-mount host filesystem\n"
		    "the core snap is too old, please run: snap refresh ubuntu-core");
		break;
	default:
		die("cannot bind-mount host filesystem at %s", hostfs_path);
		break;
	}
}

// Bind mount the snap mount directory into the new root file system.
//
// @rootfs_dir: directory that is about to become the new root file system.
//
// Unlike the other bind-mounted directories, the location of the snap mount
// directory on the host file system may not match its location in the chroot.
// On the host it is SNAP_MOUNT_DIR (a build-time setting); in the chroot the
// directory is always /snap.  Failure to mount is fatal (die()).
static void sc_bind_mount_snap_mount_dir(const char *rootfs_dir)
{
	const char *src = SNAP_MOUNT_DIR;
	char dst[512];
	// The destination is deliberately the fixed "/snap" and not
	// SNAP_MOUNT_DIR: inside the new root snaps are always found at /snap,
	// regardless of where the host distribution keeps them.
	must_snprintf(dst, sizeof dst, "%s/snap", rootfs_dir);
	debug("bind mounting %s to %s", src, dst);
	// MS_REC so that the snaps already mounted below the source directory
	// are carried over as well.
	//
	// NOTE(review): mount(2) documents that with MS_BIND the remaining flag
	// bits (other than MS_REC) are ignored, so MS_SLAVE here may have no
	// effect on its own -- TODO confirm whether a separate propagation
	// change is needed.
	if (mount(src, dst, NULL, MS_BIND | MS_REC | MS_SLAVE, NULL) != 0) {
		die("cannot bind mount %s to %s", src, dst);
	}
}

// Use pivot_root to "chroot" into a given directory.
//
// @rootfs_dir: path of the directory that becomes the new root file system.
//
// Q: Why are we using something as esoteric as pivot_root(2)?
// A: Because this makes apparmor handling easy. Using a normal chroot makes
// all apparmor rules conditional.  We are either running on an all-snap system
// where this would-be chroot didn't happen and all the rules see / as the root
// file system _OR_ we are running on top of a classic distribution and this
// chroot has now moved all paths to /tmp/snap.rootfs_*.
//
// Because we are using unshare(2) with CLONE_NEWNS we can essentially use
// pivot_root just like chroot but this makes apparmor unaware of the old root
// so everything works okay.
//
// On return the current working directory is the new root directory and the
// old root file system has been lazily detached.  Every failure is fatal
// (die()).
static void sc_pivot_to_new_rootfs(const char *rootfs_dir)
{
	int old_rootfs_fd, new_rootfs_fd;

	// Take O_PATH references to both directories up front; after the pivot
	// they are the only handles used to move between the two trees.
	old_rootfs_fd = open("/", O_DIRECTORY | O_PATH | O_CLOEXEC);
	if (old_rootfs_fd == -1) {
		die("cannot open old root file system directory");
	}
	// O_NOFOLLOW: refuse to open the new root if the last path component is
	// a symbolic link.
	new_rootfs_fd =
	    open(rootfs_dir, O_DIRECTORY | O_PATH | O_NOFOLLOW | O_CLOEXEC);
	if (new_rootfs_fd == -1) {
		die("cannot open new root file system directory");
	}
	// pivot_root below is called with "." so the process has to be inside
	// the new root directory first.
	if (fchdir(new_rootfs_fd) == -1) {
		die("cannot move to new root file system directory");
	}
	debug("using pivot_root to move into %s", rootfs_dir);
	// Both arguments are ".": the new root and the place where the old root
	// is put are the same directory, so no extra mount point is needed for
	// the old root (see pivot_root(2) for the semantics of this form).
	if (syscall(SYS_pivot_root, ".", ".") == -1) {
		die("cannot pivot_root to the new root filesystem");
	}
	// Go to the old root via the descriptor taken earlier and lazily detach
	// the mount found there.
	if (fchdir(old_rootfs_fd) == -1) {
		die("cannot move to the old root file system directory");
	}
	if (umount2(".", MNT_DETACH) == -1) {
		die("cannot detach old root file system directory");
	}
	// Leave the process in the new root directory.
	if (fchdir(new_rootfs_fd) == -1) {
		die("cannot move to the new root file system directory");
	}
	// The references are no longer needed; close() results are not checked.
	close(old_rootfs_fd);
	close(new_rootfs_fd);
}

/**
 * Assemble the root file system used on classic distributions and pivot
 * into it.
 *
 * Steps, in order:
 *  - create a scratch directory /tmp/snap.rootfs_XXXXXX,
 *  - bind mount the core snap over it,
 *  - bind mount a fixed list of host directories onto the same paths below
 *    the scratch directory,
 *  - bind mount the snap mount directory,
 *  - put /etc/alternatives from the core snap back on top of the host /etc,
 *  - bind mount the host file system at the hostfs directory,
 *  - mount the nvidia driver (sc_mount_nvidia_driver),
 *  - pivot_root into the scratch directory.
 *
 * Every failure is fatal (die()).
 **/
static void setup_snappy_os_mounts()
{
	debug("%s", __func__);
	char rootfs_dir[MAX_BUF] = { 0 };
	// Create a temporary directory that will become the root directory of
	// this process later on. The directory will be used as a mount point for
	// the core snap.
	//
	// XXX: This directory is never cleaned up today.
	must_snprintf(rootfs_dir, sizeof(rootfs_dir),
		      "/tmp/snap.rootfs_XXXXXX");
	if (mkdtemp(rootfs_dir) == NULL) {
		die("cannot create temporary directory for the root file system");
	}
	// The core snap provides the base of the new root file system.
	const char *core_snap_dir = sc_get_outer_core_mount_point();
	debug("bind mounting core snap: %s -> %s", core_snap_dir, rootfs_dir);
	if (mount(core_snap_dir, rootfs_dir, NULL, MS_BIND, NULL) != 0) {
		die("cannot bind mount core snap: %s to %s", core_snap_dir,
		    rootfs_dir);
	}
	// Host directories that are bind mounted onto the same path inside the
	// new root file system (that is, on top of the core snap). Each one is
	// justified with a short comment.
	const char *const host_dirs[] = {
		"/dev",		// because it contains devices on host OS
		"/etc",		// because that's where /etc/resolv.conf lives, perhaps a bad idea
		"/home",	// to support /home/*/snap and home interface
		"/root",	// because that is $HOME for services
		"/proc",	// fundamental filesystem
		"/sys",		// fundamental filesystem
		"/tmp",		// to get writable tmp
		"/var/snap",	// to get access to global snap data
		"/var/lib/snapd",	// to get access to snapd state and seccomp profiles
		"/var/tmp",	// to get access to the other temporary directory
		"/run",		// to get /run with sockets and what not
#ifndef MERGED_USR
		// NOTE(review): nothing is added for MERGED_USR builds; presumably
		// removable media under /run/media is covered by the recursive
		// bind mount of /run above -- TODO confirm.
		"/media",	// access to the users removable devices
#endif				// MERGED_USR
		"/lib/modules",	// access to the modules of the running kernel
		"/usr/src",	// FIXME: move to SecurityMounts in system-trace interface
		"/var/log",	// FIXME: move to SecurityMounts in log-observe interface
	};
	const size_t host_dir_count = sizeof host_dirs / sizeof host_dirs[0];
	for (size_t idx = 0; idx < host_dir_count; idx++) {
		const char *host_path = host_dirs[idx];
		char target[512];
		must_snprintf(target, sizeof target, "%s%s", rootfs_dir,
			      host_path);
		debug("bind mounting %s to %s", host_path, target);
		// NOTE: MS_REC so that anything already mounted below the host
		// directory is visible too.
		//
		// NOTE: MS_SLAVE is intended to keep the started process from
		// maliciously mounting anything into those places and affecting
		// the system on the outside.
		if (mount(host_path, target, NULL,
			  MS_BIND | MS_REC | MS_SLAVE, NULL) != 0) {
			die("cannot bind mount %s to %s", host_path, target);
		}
	}
	sc_bind_mount_snap_mount_dir(rootfs_dir);
	// The host /etc now covers the one from the core snap, so
	// /etc/alternatives from the core snap has to be put back on top.
	// https://bugs.launchpad.net/snap-confine/+bug/1580018
	const char *alternatives_dir = "/etc/alternatives";
	if (access(alternatives_dir, F_OK) == 0) {
		char from_core[512];
		char in_rootfs[512];
		must_snprintf(from_core, sizeof from_core, "%s%s",
			      core_snap_dir, alternatives_dir);
		must_snprintf(in_rootfs, sizeof in_rootfs, "%s%s", rootfs_dir,
			      alternatives_dir);
		debug("bind mounting %s to %s", from_core, in_rootfs);
		// NOTE: MS_SLAVE for the same reason as in the loop above.
		if (mount(from_core, in_rootfs, NULL, MS_BIND | MS_SLAVE, NULL)
		    != 0) {
			die("cannot bind mount %s to %s", from_core, in_rootfs);
		}
	}
	sc_mkdir_hostfs_if_missing();
	sc_bind_mount_hostfs(rootfs_dir);
	sc_mount_nvidia_driver(rootfs_dir);
	sc_pivot_to_new_rootfs(rootfs_dir);
}

/**
 * Apply the mount profile that snapd wrote for the given security tag.
 *
 * @security_tag: security tag of the application being started.
 *
 * The file /var/lib/snapd/mount/$security_tag.fstab is read as a fstab(5)
 * file and each entry is turned into a mount(2) call.  A missing file is not
 * an error; any other failure to open it is fatal.
 *
 * Only bind mounts are accepted: the file system type must be "none" and the
 * "bind" option is mandatory.  Entries are requested read-only, nodev and
 * nosuid; the "rw" option drops the read-only flag.
 *
 * This function is called with the rootfs being "consistent" so that it is
 * either the core snap on an all-snap system or the core snap + punched holes
 * on a classic system.
 *
 * NOTE(review): mount(2) documents that with MS_BIND the remaining flag bits
 * (other than MS_REC) are ignored, so ro/nodev/nosuid may not take effect
 * without a follow-up MS_REMOUNT | MS_BIND call -- TODO confirm.
 **/
static void sc_setup_mount_profiles(const char *security_tag)
{
	debug("%s: %s", __func__, security_tag);

	char fstab_path[PATH_MAX];
	must_snprintf(fstab_path, sizeof(fstab_path), "%s/%s.fstab",
		      "/var/lib/snapd/mount", security_tag);

	debug("opening mount profile %s", fstab_path);
	// Closed automatically through the cleanup handler on scope exit.
	FILE *profile __attribute__ ((cleanup(sc_cleanup_endmntent))) =
	    setmntent(fstab_path, "r");
	if (profile == NULL) {
		if (errno == ENOENT) {
			// It is fine for the profile not to exist.
			debug("mount profile %s doesn't exist, ignoring",
			      fstab_path);
			return;
		}
		// Anything else is a real error.
		die("cannot open %s", fstab_path);
	}

	for (struct mntent * entry = getmntent(profile); entry != NULL;
	     entry = getmntent(profile)) {
		debug("read mount entry\n"
		      "\tmnt_fsname: %s\n"
		      "\tmnt_dir: %s\n"
		      "\tmnt_type: %s\n"
		      "\tmnt_opts: %s\n"
		      "\tmnt_freq: %d\n"
		      "\tmnt_passno: %d",
		      entry->mnt_fsname, entry->mnt_dir, entry->mnt_type,
		      entry->mnt_opts, entry->mnt_freq, entry->mnt_passno);
		int mount_flags = MS_BIND | MS_RDONLY | MS_NODEV | MS_NOSUID;
		debug("initial flags are: bind,ro,nodev,nosuid");
		if (strcmp(entry->mnt_type, "none") != 0) {
			die("only 'none' filesystem type is supported");
		}
		if (hasmntopt(entry, "bind") == NULL) {
			die("the bind mount flag is mandatory");
		}
		if (hasmntopt(entry, "rw") != NULL) {
			mount_flags &= ~MS_RDONLY;
		}
		if (mount(entry->mnt_fsname, entry->mnt_dir, NULL, mount_flags,
			  NULL) != 0) {
			die("cannot mount %s at %s with options %s",
			    entry->mnt_fsname, entry->mnt_dir, entry->mnt_opts);
		}
	}
}

/**
 * Advance to the next segment of a path whose separators were replaced
 * with '\0'.
 *
 * @path:    a pathname where every '/' was replaced with '\0'.
 * @offsetp: on entry, the offset of the path segment that was last seen; on
 *           return, the offset of the next segment (or @fulllen when there is
 *           none).  Left untouched when it is already at or past @fulllen.
 * @fulllen: full original path length.
 *
 * Returns a pointer to the next path segment, or NULL if done.
 */
static char * __attribute__ ((used))
    get_nextpath(char *path, size_t * offsetp, size_t fulllen)
{
	// Kept as size_t (not int) so that large offsets are not truncated and
	// the comparisons against @fulllen stay unsigned on both sides.
	size_t offset = *offsetp;

	if (offset >= fulllen) {
		return NULL;
	}
	// Skip what is left of the current segment...
	while (offset < fulllen && path[offset] != '\0') {
		offset++;
	}
	// ...and then the run of separators that follows it.
	while (offset < fulllen && path[offset] == '\0') {
		offset++;
	}

	*offsetp = offset;
	return (offset < fulllen) ? &path[offset] : NULL;
}

/**
 * Check that @subdir is a subdir of @dir.
 *
 * The check is purely textual.  @subdir qualifies when @dir is a prefix of it
 * and one of the following holds:
 *  - both strings are identical,
 *  - @dir ends with the directory separator,
 *  - the character in @subdir right after the prefix is the directory
 *    separator.
 *
 * An empty @dir only matches an empty @subdir.
 **/
static bool __attribute__ ((used))
    is_subdir(const char *subdir, const char *dir)
{
	const size_t parent_len = strlen(dir);
	const size_t child_len = strlen(subdir);

	// @subdir has to be at least as long as @dir and start with it.
	if (child_len < parent_len || strncmp(subdir, dir, parent_len) != 0) {
		return false;
	}
	// Identical strings: a directory counts as a subdirectory of itself.
	if (child_len == parent_len) {
		return true;
	}
	// From here on @subdir is strictly longer than @dir. An empty @dir
	// carries no separator to anchor the match on.
	if (parent_len == 0) {
		return false;
	}
	// The prefix has to end on a path component boundary: either @dir
	// itself ends with '/' or @subdir continues with '/' right after it.
	return dir[parent_len - 1] == '/' || subdir[parent_len] == '/';
}

/**
 * Populate the mount namespace of the calling process for the given
 * security tag.
 *
 * @security_tag: security tag of the application being started.
 *
 * In order: "/" is made a recursive slave, the classic-only root file system
 * is assembled (when running on a classic distribution), a private /tmp and a
 * private /dev/pts are set up, classic-only quirks are applied and finally
 * the mount profile written by snapd is processed.
 *
 * The working directory seen on entry is restored at the end when that is
 * still possible; otherwise the process moves to SC_VOID_DIR.
 **/
void sc_populate_mount_ns(const char *security_tag)
{
	// Remember where we started before fiddling with mounts and possibly
	// pivot_root. The string is released by the cleanup handler.
	char *start_dir __attribute__ ((cleanup(sc_cleanup_string))) =
	    get_current_dir_name();
	if (start_dir == NULL) {
		die("cannot get the current working directory");
	}
	// Make our "/" a rslave of the real "/". This means that mounts from the
	// host "/" get propagated to our namespace (i.e. we see new media mounts).
	if (mount("none", "/", NULL, MS_REC | MS_SLAVE, NULL) != 0) {
		die("can not make make / rslave");
	}
	const bool classic = is_running_on_classic_distribution();
	if (classic) {
		// Not a native snappy system: build the root file system first.
		setup_snappy_os_mounts();
	}
	// Private /tmp, then private /dev/pts.
	setup_private_mount(security_tag);
	setup_private_pts();

	if (classic) {
		// Quirks for specific snaps.
		sc_setup_quirks();
	}
	// Bind mounts requested by the security backend.
	sc_setup_mount_profiles(security_tag);

	// Try to go back to where we started. This can fail because that
	// directory is no longer present.
	if (chdir(start_dir) == 0) {
		return;
	}
	debug("cannot remain in %s, moving to the void directory", start_dir);
	if (chdir(SC_VOID_DIR) != 0) {
		die("cannot change directory to %s", SC_VOID_DIR);
	}
	debug("successfully moved to %s", SC_VOID_DIR);
}
