security/sandbox/linux/SandboxFilter.cpp
author Jed Davis <jld@mozilla.com>
Wed, 25 Oct 2017 12:43:13 -0600
changeset 688059 f19339207df8aefb66b334de7eb4686fad820504
parent 687660 ae49d4a5762264ded3aae4006baddc2203b79b94
child 688060 877ce6f1fa91c454cd2306e3a8a882d3617e9215
permissions -rw-r--r--
Bug 1409900 - Disallow quotactl in sandboxed content processes. r=gcp MozReview-Commit-ID: 3svUgLLTZKL

/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this file,
 * You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "SandboxFilter.h"
#include "SandboxFilterUtil.h"

#include "SandboxBrokerClient.h"
#include "SandboxInfo.h"
#include "SandboxInternal.h"
#include "SandboxLogging.h"
#ifdef MOZ_GMP_SANDBOX
#include "SandboxOpenedFiles.h"
#endif
#include "mozilla/PodOperations.h"
#include "mozilla/UniquePtr.h"

#include <errno.h>
#include <fcntl.h>
#include <linux/ioctl.h>
#include <linux/ipc.h>
#include <linux/net.h>
#include <linux/prctl.h>
#include <linux/sched.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/socket.h>
#include <sys/syscall.h>
#include <sys/utsname.h>
#include <time.h>
#include <unistd.h>
#include <vector>
#include <algorithm>

#include "sandbox/linux/bpf_dsl/bpf_dsl.h"
#include "sandbox/linux/system_headers/linux_seccomp.h"
#include "sandbox/linux/system_headers/linux_syscalls.h"

using namespace sandbox::bpf_dsl;
#define CASES SANDBOX_BPF_DSL_CASES

// Fill in defines in case of old headers.
// (Warning: these are wrong on PA-RISC.)
#ifndef MADV_HUGEPAGE
#define MADV_HUGEPAGE 14
#endif
#ifndef MADV_NOHUGEPAGE
#define MADV_NOHUGEPAGE 15
#endif
#ifndef MADV_DONTDUMP
#define MADV_DONTDUMP 16
#endif

// Added in Linux 4.5; see bug 1303813.
#ifndef MADV_FREE
#define MADV_FREE 8
#endif

#ifndef PR_SET_PTRACER
#define PR_SET_PTRACER 0x59616d61
#endif

// The headers define O_LARGEFILE as 0 on x86_64, but we need the
// actual value because it shows up in file flags.
#define O_LARGEFILE_REAL 00100000

// To avoid visual confusion between "ifdef ANDROID" and "ifndef ANDROID":
#ifndef ANDROID
#define DESKTOP
#endif

// This file defines the seccomp-bpf system call filter policies.
// See also SandboxFilterUtil.h, for the CASES_FOR_* macros and
// SandboxFilterBase::Evaluate{Socket,Ipc}Call.
//
// One important difference from how Chromium bpf_dsl filters are
// normally interpreted: returning -ENOSYS from a Trap() handler
// indicates an unexpected system call; SigSysHandler() in Sandbox.cpp
// will detect this, request a crash dump, and terminate the process.
// This does not apply to using Error(ENOSYS) in the policy, so that
// can be used if returning an actual ENOSYS is needed.

namespace mozilla {

// This class whitelists everything used by the sandbox itself, by the
// core IPC code, by the crash reporter, or other core code.
class SandboxPolicyCommon : public SandboxPolicyBase
{
protected:
  typedef const sandbox::arch_seccomp_data& ArgsRef;

  static intptr_t BlockedSyscallTrap(ArgsRef aArgs, void *aux) {
    MOZ_ASSERT(!aux);
    return -ENOSYS;
  }

  // Convert Unix-style "return -1 and set errno" APIs back into the
  // Linux ABI "return -err" style.
  static intptr_t ConvertError(long rv) {
    return rv < 0 ? -errno : rv;
  }

  template<typename... Args>
  static intptr_t DoSyscall(long nr, Args... args) {
    return ConvertError(syscall(nr, args...));
  }

private:
  // Bug 1093893: Translate tkill to tgkill for pthread_kill; fixed in
  // bionic commit 10c8ce59a (in JB and up; API level 16 = Android 4.1).
  // Bug 1376653: musl also needs this, and security-wise it's harmless.
  static intptr_t TKillCompatTrap(const sandbox::arch_seccomp_data& aArgs,
                                  void *aux)
  {
    return DoSyscall(__NR_tgkill, getpid(), aArgs.args[0], aArgs.args[1]);
  }

  static intptr_t SetNoNewPrivsTrap(ArgsRef& aArgs, void* aux) {
    if (gSetSandboxFilter == nullptr) {
      // Called after BroadcastSetThreadSandbox finished, therefore
      // not our doing and not expected.
      return BlockedSyscallTrap(aArgs, nullptr);
    }
    // Signal that the filter is already in place.
    return -ETXTBSY;
  }

public:
  ResultExpr InvalidSyscall() const override {
    return Trap(BlockedSyscallTrap, nullptr);
  }

  virtual ResultExpr ClonePolicy(ResultExpr failPolicy) const {
    // Allow use for simple thread creation (pthread_create) only.

    // WARNING: s390 and cris pass the flags in the second arg -- see
    // CLONE_BACKWARDS2 in arch/Kconfig in the kernel source -- but we
    // don't support seccomp-bpf on those archs yet.
    Arg<int> flags(0);

    // The exact flags used can vary.  CLONE_DETACHED is used by musl
    // and by old versions of Android (<= JB 4.2), but it's been
    // ignored by the kernel since the beginning of the Git history.
    //
    // If we ever need to support Android <= KK 4.4 again, SETTLS
    // and the *TID flags will need to be made optional.
    static const int flags_required = CLONE_VM | CLONE_FS | CLONE_FILES |
      CLONE_SIGHAND | CLONE_THREAD | CLONE_SYSVSEM | CLONE_SETTLS |
      CLONE_PARENT_SETTID | CLONE_CHILD_CLEARTID;
    static const int flags_optional = CLONE_DETACHED;

    return If((flags & ~flags_optional) == flags_required, Allow())
      .Else(failPolicy);
  }

  virtual ResultExpr PrctlPolicy() const {
    // Note: this will probably need PR_SET_VMA if/when it's used on
    // Android without being overridden by an allow-all policy, and
    // the constant will need to be defined locally.
    Arg<int> op(0);
    return Switch(op)
      .CASES((PR_GET_SECCOMP, // BroadcastSetThreadSandbox, etc.
              PR_SET_NAME,    // Thread creation
              PR_SET_DUMPABLE, // Crash reporting
              PR_SET_PTRACER), // Debug-mode crash handling
             Allow())
      .Default(InvalidSyscall());
  }

  Maybe<ResultExpr> EvaluateSocketCall(int aCall) const override {
    switch (aCall) {
    case SYS_RECVMSG:
    case SYS_SENDMSG:
      return Some(Allow());
    default:
      return Nothing();
    }
  }

  ResultExpr EvaluateSyscall(int sysno) const override {
    switch (sysno) {
      // Timekeeping
    case __NR_clock_gettime: {
      Arg<clockid_t> clk_id(0);
      return If(clk_id == CLOCK_MONOTONIC, Allow())
#ifdef CLOCK_MONOTONIC_COARSE
        // Used by SandboxReporter, among other things.
        .ElseIf(clk_id == CLOCK_MONOTONIC_COARSE, Allow())
#endif
        .ElseIf(clk_id == CLOCK_PROCESS_CPUTIME_ID, Allow())
        .ElseIf(clk_id == CLOCK_REALTIME, Allow())
#ifdef CLOCK_REALTIME_COARSE
        .ElseIf(clk_id == CLOCK_REALTIME_COARSE, Allow())
#endif
        .ElseIf(clk_id == CLOCK_THREAD_CPUTIME_ID, Allow())
        .Else(InvalidSyscall());
    }
    case __NR_gettimeofday:
#ifdef __NR_time
    case __NR_time:
#endif
    case __NR_nanosleep:
      return Allow();

      // Thread synchronization
    case __NR_futex:
      // FIXME: This could be more restrictive....
      return Allow();

      // Asynchronous I/O
    case __NR_epoll_create1:
    case __NR_epoll_create:
    case __NR_epoll_wait:
    case __NR_epoll_pwait:
    case __NR_epoll_ctl:
    case __NR_ppoll:
    case __NR_poll:
      return Allow();

      // Used when requesting a crash dump.
    case __NR_pipe:
      return Allow();

      // Metadata of opened files
    CASES_FOR_fstat:
      return Allow();

      // Simple I/O
    case __NR_write:
    case __NR_read:
    case __NR_readv:
    case __NR_writev: // see SandboxLogging.cpp
    CASES_FOR_lseek:
      return Allow();

      // Memory mapping
    CASES_FOR_mmap:
    case __NR_munmap:
      return Allow();

      // Signal handling
#if defined(ANDROID) || defined(MOZ_ASAN)
    case __NR_sigaltstack:
#endif
    CASES_FOR_sigreturn:
    CASES_FOR_sigprocmask:
    CASES_FOR_sigaction:
      return Allow();

      // Send signals within the process (raise(), profiling, etc.)
    case __NR_tgkill: {
      Arg<pid_t> tgid(0);
      return If(tgid == getpid(), Allow())
        .Else(InvalidSyscall());
    }

      // Polyfill with tgkill; see above.
    case __NR_tkill:
      return Trap(TKillCompatTrap, nullptr);

      // Yield
    case __NR_sched_yield:
      return Allow();

      // Thread creation.
    case __NR_clone:
      return ClonePolicy(InvalidSyscall());

      // More thread creation.
#ifdef __NR_set_robust_list
    case __NR_set_robust_list:
      return Allow();
#endif
#ifdef ANDROID
    case __NR_set_tid_address:
      return Allow();
#endif

      // prctl
    case __NR_prctl: {
      if (SandboxInfo::Get().Test(SandboxInfo::kHasSeccompTSync)) {
        return PrctlPolicy();
      }

      Arg<int> option(0);
      return If(option == PR_SET_NO_NEW_PRIVS,
                Trap(SetNoNewPrivsTrap, nullptr))
        .Else(PrctlPolicy());
    }

      // NSPR can call this when creating a thread, but it will accept a
      // polite "no".
    case __NR_getpriority:
      // But if thread creation races with sandbox startup, that call
      // could succeed, and then we get one of these:
    case __NR_setpriority:
      return Error(EACCES);

      // Stack bounds are obtained via pthread_getattr_np, which calls
      // this but doesn't actually need it:
    case __NR_sched_getaffinity:
      return Error(ENOSYS);

      // Read own pid/tid.
    case __NR_getpid:
    case __NR_gettid:
      return Allow();

      // Discard capabilities
    case __NR_close:
      return Allow();

      // Machine-dependent stuff
#ifdef __arm__
    case __ARM_NR_breakpoint:
    case __ARM_NR_cacheflush:
    case __ARM_NR_usr26: // FIXME: do we actually need this?
    case __ARM_NR_usr32:
    case __ARM_NR_set_tls:
      return Allow();
#endif

      // Needed when being debugged:
    case __NR_restart_syscall:
      return Allow();

      // Terminate threads or the process
    case __NR_exit:
    case __NR_exit_group:
      return Allow();

#ifdef MOZ_ASAN
      // ASAN's error reporter wants to know if stderr is a tty.
    case __NR_ioctl: {
      Arg<int> fd(0);
      return If(fd == STDERR_FILENO, Error(ENOTTY))
        .Else(InvalidSyscall());
    }

      // ...and before compiler-rt r209773, it will call readlink on
      // /proc/self/exe and use the cached value only if that fails:
    case __NR_readlink:
    case __NR_readlinkat:
      return Error(ENOENT);

      // ...and if it found an external symbolizer, it will try to run it:
      // (See also bug 1081242 comment #7.)
    CASES_FOR_stat:
      return Error(ENOENT);
#endif

    default:
      return SandboxPolicyBase::EvaluateSyscall(sysno);
    }
  }
};

// The process-type-specific syscall rules start here:

#ifdef MOZ_CONTENT_SANDBOX
// The seccomp-bpf filter for content processes is not a true sandbox
// on its own; its purpose is attack surface reduction and syscall
// interception in support of a semantic sandboxing layer.  On B2G
// this is the Android process permission model; on desktop,
// namespaces and chroot() will be used.
class ContentSandboxPolicy : public SandboxPolicyCommon {
private:
  SandboxBrokerClient* mBroker;
  std::vector<int> mSyscallWhitelist;

  // Trap handlers for filesystem brokering.
  // (The amount of code duplication here could be improved....)
#ifdef __NR_open
  static intptr_t OpenTrap(ArgsRef aArgs, void* aux) {
    auto broker = static_cast<SandboxBrokerClient*>(aux);
    auto path = reinterpret_cast<const char*>(aArgs.args[0]);
    auto flags = static_cast<int>(aArgs.args[1]);
    return broker->Open(path, flags);
  }
#endif

  static intptr_t OpenAtTrap(ArgsRef aArgs, void* aux) {
    auto broker = static_cast<SandboxBrokerClient*>(aux);
    auto fd = static_cast<int>(aArgs.args[0]);
    auto path = reinterpret_cast<const char*>(aArgs.args[1]);
    auto flags = static_cast<int>(aArgs.args[2]);
    if (fd != AT_FDCWD && path[0] != '/') {
      SANDBOX_LOG_ERROR("unsupported fd-relative openat(%d, \"%s\", 0%o)",
                        fd, path, flags);
      return BlockedSyscallTrap(aArgs, nullptr);
    }
    return broker->Open(path, flags);
  }

#ifdef __NR_access
  static intptr_t AccessTrap(ArgsRef aArgs, void* aux) {
    auto broker = static_cast<SandboxBrokerClient*>(aux);
    auto path = reinterpret_cast<const char*>(aArgs.args[0]);
    auto mode = static_cast<int>(aArgs.args[1]);
    return broker->Access(path, mode);
  }
#endif

  static intptr_t AccessAtTrap(ArgsRef aArgs, void* aux) {
    auto broker = static_cast<SandboxBrokerClient*>(aux);
    auto fd = static_cast<int>(aArgs.args[0]);
    auto path = reinterpret_cast<const char*>(aArgs.args[1]);
    auto mode = static_cast<int>(aArgs.args[2]);
    // Linux's faccessat syscall has no "flags" argument.  Attempting
    // to handle the flags != 0 case is left to userspace; this is
    // impossible to do correctly in all cases, but that's not our
    // problem.
    if (fd != AT_FDCWD && path[0] != '/') {
      SANDBOX_LOG_ERROR("unsupported fd-relative faccessat(%d, \"%s\", %d)",
                        fd, path, mode);
      return BlockedSyscallTrap(aArgs, nullptr);
    }
    return broker->Access(path, mode);
  }

  static intptr_t StatTrap(ArgsRef aArgs, void* aux) {
    auto broker = static_cast<SandboxBrokerClient*>(aux);
    auto path = reinterpret_cast<const char*>(aArgs.args[0]);
    auto buf = reinterpret_cast<statstruct*>(aArgs.args[1]);
    return broker->Stat(path, buf);
  }

  static intptr_t LStatTrap(ArgsRef aArgs, void* aux) {
    auto broker = static_cast<SandboxBrokerClient*>(aux);
    auto path = reinterpret_cast<const char*>(aArgs.args[0]);
    auto buf = reinterpret_cast<statstruct*>(aArgs.args[1]);
    return broker->LStat(path, buf);
  }

  static intptr_t StatAtTrap(ArgsRef aArgs, void* aux) {
    auto broker = static_cast<SandboxBrokerClient*>(aux);
    auto fd = static_cast<int>(aArgs.args[0]);
    auto path = reinterpret_cast<const char*>(aArgs.args[1]);
    auto buf = reinterpret_cast<statstruct*>(aArgs.args[2]);
    auto flags = static_cast<int>(aArgs.args[3]);
    if (fd != AT_FDCWD && path[0] != '/') {
      SANDBOX_LOG_ERROR("unsupported fd-relative fstatat(%d, \"%s\", %p, %d)",
                        fd, path, buf, flags);
      return BlockedSyscallTrap(aArgs, nullptr);
    }
    if ((flags & ~AT_SYMLINK_NOFOLLOW) != 0) {
      SANDBOX_LOG_ERROR("unsupported flags %d in fstatat(%d, \"%s\", %p, %d)",
                        (flags & ~AT_SYMLINK_NOFOLLOW), fd, path, buf, flags);
      return BlockedSyscallTrap(aArgs, nullptr);
    }
    return (flags & AT_SYMLINK_NOFOLLOW) == 0
      ? broker->Stat(path, buf)
      : broker->LStat(path, buf);
  }

  static intptr_t ChmodTrap(ArgsRef aArgs, void* aux) {
    auto broker = static_cast<SandboxBrokerClient*>(aux);
    auto path = reinterpret_cast<const char*>(aArgs.args[0]);
    auto mode = static_cast<mode_t>(aArgs.args[1]);
    return broker->Chmod(path, mode);
  }

  static intptr_t LinkTrap(ArgsRef aArgs, void *aux) {
    auto broker = static_cast<SandboxBrokerClient*>(aux);
    auto path = reinterpret_cast<const char*>(aArgs.args[0]);
    auto path2 = reinterpret_cast<const char*>(aArgs.args[1]);
    return broker->Link(path, path2);
  }

  static intptr_t SymlinkTrap(ArgsRef aArgs, void *aux) {
    auto broker = static_cast<SandboxBrokerClient*>(aux);
    auto path = reinterpret_cast<const char*>(aArgs.args[0]);
    auto path2 = reinterpret_cast<const char*>(aArgs.args[1]);
    return broker->Symlink(path, path2);
  }

  static intptr_t RenameTrap(ArgsRef aArgs, void *aux) {
    auto broker = static_cast<SandboxBrokerClient*>(aux);
    auto path = reinterpret_cast<const char*>(aArgs.args[0]);
    auto path2 = reinterpret_cast<const char*>(aArgs.args[1]);
    return broker->Rename(path, path2);
  }

  static intptr_t MkdirTrap(ArgsRef aArgs, void* aux) {
    auto broker = static_cast<SandboxBrokerClient*>(aux);
    auto path = reinterpret_cast<const char*>(aArgs.args[0]);
    auto mode = static_cast<mode_t>(aArgs.args[1]);
    return broker->Mkdir(path, mode);
  }

  static intptr_t RmdirTrap(ArgsRef aArgs, void* aux) {
    auto broker = static_cast<SandboxBrokerClient*>(aux);
    auto path = reinterpret_cast<const char*>(aArgs.args[0]);
    return broker->Rmdir(path);
  }

  static intptr_t UnlinkTrap(ArgsRef aArgs, void* aux) {
    auto broker = static_cast<SandboxBrokerClient*>(aux);
    auto path = reinterpret_cast<const char*>(aArgs.args[0]);
    return broker->Unlink(path);
  }

  static intptr_t ReadlinkTrap(ArgsRef aArgs, void* aux) {
    auto broker = static_cast<SandboxBrokerClient*>(aux);
    auto path = reinterpret_cast<const char*>(aArgs.args[0]);
    auto buf = reinterpret_cast<char*>(aArgs.args[1]);
    auto size = static_cast<size_t>(aArgs.args[2]);
    return broker->Readlink(path, buf, size);
  }

  static intptr_t ReadlinkAtTrap(ArgsRef aArgs, void* aux) {
    auto broker = static_cast<SandboxBrokerClient*>(aux);
    auto fd = static_cast<int>(aArgs.args[0]);
    auto path = reinterpret_cast<const char*>(aArgs.args[1]);
    auto buf = reinterpret_cast<char*>(aArgs.args[2]);
    auto size = static_cast<size_t>(aArgs.args[3]);
    if (fd != AT_FDCWD && path[0] != '/') {
      SANDBOX_LOG_ERROR("unsupported fd-relative readlinkat(%d, %s, %p, %u)",
                        fd, path, buf, size);
      return BlockedSyscallTrap(aArgs, nullptr);
    }
    return broker->Readlink(path, buf, size);
  }

  static intptr_t GetPPidTrap(ArgsRef aArgs, void* aux) {
    // In a pid namespace, getppid() will return 0. We will return 0 instead
    // of the real parent pid to see what breaks when we introduce the
    // pid namespace (Bug 1151624).
    return 0;
  }

  static intptr_t SocketpairDatagramTrap(ArgsRef aArgs, void* aux) {
    auto fds = reinterpret_cast<int*>(aArgs.args[3]);
    // Return sequential packet sockets instead of the expected
    // datagram sockets; see bug 1355274 for details.
    return ConvertError(socketpair(AF_UNIX, SOCK_SEQPACKET, 0, fds));
  }

public:
  explicit ContentSandboxPolicy(SandboxBrokerClient* aBroker,
                                const std::vector<int>& aSyscallWhitelist)
    : mBroker(aBroker),
      mSyscallWhitelist(aSyscallWhitelist) {}
  ~ContentSandboxPolicy() override = default;
  Maybe<ResultExpr> EvaluateSocketCall(int aCall) const override {
    switch(aCall) {
    case SYS_RECVFROM:
    case SYS_SENDTO:
    case SYS_SENDMMSG: // libresolv via libasyncns; see bug 1355274
      return Some(Allow());

    case SYS_SOCKETPAIR: {
      // See bug 1066750.
      if (!kSocketCallHasArgs) {
        // We can't filter the args if the platform passes them by pointer.
        return Some(Allow());
      }
      Arg<int> domain(0), type(1);
      return Some(If(domain == AF_UNIX,
                     Switch(type & ~SOCK_CLOEXEC)
                     .Case(SOCK_STREAM, Allow())
                     .Case(SOCK_SEQPACKET, Allow())
                     .Case(SOCK_DGRAM, Trap(SocketpairDatagramTrap, nullptr))
                     .Default(InvalidSyscall()))
                  .Else(InvalidSyscall()));
    }

#ifdef ANDROID
    case SYS_SOCKET:
      return Some(Error(EACCES));
#else // #ifdef DESKTOP
    case SYS_RECV:
    case SYS_SEND:
    case SYS_SOCKET: // DANGEROUS
    case SYS_CONNECT: // DANGEROUS
    case SYS_GETSOCKOPT:
    case SYS_SETSOCKOPT:
    case SYS_GETSOCKNAME:
    case SYS_GETPEERNAME:
    case SYS_SHUTDOWN:
      return Some(Allow());
#endif
    default:
      return SandboxPolicyCommon::EvaluateSocketCall(aCall);
    }
  }

#ifdef DESKTOP
  Maybe<ResultExpr> EvaluateIpcCall(int aCall) const override {
    switch(aCall) {
      // These are a problem: SysV shared memory follows the Unix
      // "same uid policy" and can't be restricted/brokered like file
      // access.  But the graphics layer might not be using them
      // anymore; this needs to be studied.
    case SHMGET:
    case SHMCTL:
    case SHMAT:
    case SHMDT:
    case SEMGET:
    case SEMCTL:
    case SEMOP:
    case MSGGET:
      return Some(Allow());
    default:
      return SandboxPolicyCommon::EvaluateIpcCall(aCall);
    }
  }
#endif

#ifdef MOZ_PULSEAUDIO
  ResultExpr PrctlPolicy() const override {
    Arg<int> op(0);
    return If(op == PR_GET_NAME, Allow())
      .Else(SandboxPolicyCommon::PrctlPolicy());
  }
#endif

  ResultExpr EvaluateSyscall(int sysno) const override {
    // Straight allow for anything that got overriden via prefs
    if (std::find(mSyscallWhitelist.begin(), mSyscallWhitelist.end(), sysno)
        != mSyscallWhitelist.end()) {
      if (SandboxInfo::Get().Test(SandboxInfo::kVerbose)) {
        SANDBOX_LOG_ERROR("Allowing syscall nr %d via whitelist", sysno);
      }
      return Allow();
    }
    if (mBroker) {
      // Have broker; route the appropriate syscalls to it.
      switch (sysno) {
      case __NR_open:
        return Trap(OpenTrap, mBroker);
      case __NR_openat:
        return Trap(OpenAtTrap, mBroker);
      case __NR_access:
        return Trap(AccessTrap, mBroker);
      case __NR_faccessat:
        return Trap(AccessAtTrap, mBroker);
      CASES_FOR_stat:
        return Trap(StatTrap, mBroker);
      CASES_FOR_lstat:
        return Trap(LStatTrap, mBroker);
      CASES_FOR_fstatat:
        return Trap(StatAtTrap, mBroker);
      case __NR_chmod:
        return Trap(ChmodTrap, mBroker);
      case __NR_link:
        return Trap(LinkTrap, mBroker);
      case __NR_mkdir:
        return Trap(MkdirTrap, mBroker);
      case __NR_symlink:
        return Trap(SymlinkTrap, mBroker);
      case __NR_rename:
        return Trap(RenameTrap, mBroker);
      case __NR_rmdir:
        return Trap(RmdirTrap, mBroker);
      case __NR_unlink:
        return Trap(UnlinkTrap, mBroker);
      case __NR_readlink:
        return Trap(ReadlinkTrap, mBroker);
      case __NR_readlinkat:
        return Trap(ReadlinkAtTrap, mBroker);
      }
    } else {
      // No broker; allow the syscalls directly.  )-:
      switch(sysno) {
      case __NR_open:
      case __NR_openat:
      case __NR_access:
      case __NR_faccessat:
      CASES_FOR_stat:
      CASES_FOR_lstat:
      CASES_FOR_fstatat:
      case __NR_chmod:
      case __NR_link:
      case __NR_mkdir:
      case __NR_symlink:
      case __NR_rename:
      case __NR_rmdir:
      case __NR_unlink:
      case __NR_readlink:
      case __NR_readlinkat:
        return Allow();
      }
    }

    switch (sysno) {
#ifdef DESKTOP
    case __NR_getppid:
      return Trap(GetPPidTrap, nullptr);

      // Filesystem syscalls that need more work to determine who's
      // using them, if they need to be, and what we intend to about it.
    case __NR_getcwd:
    CASES_FOR_statfs:
    CASES_FOR_fstatfs:
    CASES_FOR_fchown:
    case __NR_fchmod:
    case __NR_flock:
      return Allow();

      // Bug 1354731: proprietary GL drivers try to mknod() their devices
    case __NR_mknod: {
      Arg<mode_t> mode(1);
      return If((mode & S_IFMT) == S_IFCHR, Error(EPERM))
        .Else(InvalidSyscall());
    }

      // For ORBit called by GConf (on some systems) to get proxy
      // settings.  Can remove when bug 1325242 happens in some form.
    case __NR_utime:
      return Error(EPERM);
#endif

    CASES_FOR_select:
    case __NR_pselect6:
      return Allow();

    CASES_FOR_getdents:
    CASES_FOR_ftruncate:
    case __NR_writev:
    case __NR_pread64:
#ifdef DESKTOP
    case __NR_pwrite64:
    case __NR_readahead:
#endif
      return Allow();

#ifdef MOZ_ALSA
    case __NR_ioctl:
      return Allow();
#else
    case __NR_ioctl: {
      static const unsigned long kTypeMask = _IOC_TYPEMASK << _IOC_TYPESHIFT;
      static const unsigned long kTtyIoctls = TIOCSTI & kTypeMask;
      // On some older architectures (but not x86 or ARM), ioctls are
      // assigned type fields differently, and the TIOC/TC/FIO group
      // isn't all the same type.  If/when we support those archs,
      // this would need to be revised (but really this should be a
      // default-deny policy; see below).
      static_assert(kTtyIoctls == (TCSETA & kTypeMask) &&
                    kTtyIoctls == (FIOASYNC & kTypeMask),
                    "tty-related ioctls use the same type");

      Arg<unsigned long> request(1);
      auto shifted_type = request & kTypeMask;

      // Rust's stdlib seems to use FIOCLEX instead of equivalent fcntls.
      return If(request == FIOCLEX, Allow())
        // Rust's stdlib also uses FIONBIO instead of equivalent fcntls.
        .ElseIf(request == FIONBIO, Allow())
        // ffmpeg, and anything else that calls isatty(), will be told
        // that nothing is a typewriter:
        .ElseIf(request == TCGETS, Error(ENOTTY))
        // Allow anything that isn't a tty ioctl, for now; bug 1302711
        // will cover changing this to a default-deny policy.
        .ElseIf(shifted_type != kTtyIoctls, Allow())
        .Else(SandboxPolicyCommon::EvaluateSyscall(sysno));
    }
#endif // !MOZ_ALSA

    CASES_FOR_fcntl: {
      Arg<int> cmd(1);
      Arg<int> flags(2);
      // Typical use of F_SETFL is to modify the flags returned by
      // F_GETFL and write them back, including some flags that
      // F_SETFL ignores.  This is a default-deny policy in case any
      // new SETFL-able flags are added.  (In particular we want to
      // forbid O_ASYNC; see bug 1328896, but also see bug 1408438.)
      static const int ignored_flags = O_ACCMODE | O_LARGEFILE_REAL | O_CLOEXEC;
      static const int allowed_flags = ignored_flags | O_APPEND | O_NONBLOCK;
      return Switch(cmd)
        // Close-on-exec is meaningless when execve isn't allowed, but
        // NSPR reads the bit and asserts that it has the expected value.
        .Case(F_GETFD, Allow())
        .Case(F_SETFD,
              If((flags & ~FD_CLOEXEC) == 0, Allow())
              .Else(InvalidSyscall()))
        .Case(F_GETFL, Allow())
        .Case(F_SETFL,
              If((flags & ~allowed_flags) == 0, Allow())
              .Else(InvalidSyscall()))
        .Case(F_DUPFD_CLOEXEC, Allow())
        // Pulseaudio uses F_SETLKW.
        .Case(F_SETLKW, Allow())
#ifdef F_SETLKW64
        .Case(F_SETLKW64, Allow())
#endif
        .Default(SandboxPolicyCommon::EvaluateSyscall(sysno));
    }

    case __NR_mprotect:
    case __NR_brk:
    case __NR_madvise:
      // libc's realloc uses mremap (Bug 1286119); wasm does too (bug 1342385).
    case __NR_mremap:
      return Allow();

    case __NR_sigaltstack:
      return Allow();

#ifdef __NR_set_thread_area
    case __NR_set_thread_area:
      return Allow();
#endif

    case __NR_getrusage:
    case __NR_times:
      return Allow();

    case __NR_dup:
      return Allow();

    CASES_FOR_getuid:
    CASES_FOR_getgid:
    CASES_FOR_geteuid:
    CASES_FOR_getegid:
      return Allow();

    case __NR_fsync:
    case __NR_msync:
      return Allow();

    case __NR_getpriority:
    case __NR_setpriority:
    case __NR_sched_get_priority_min:
    case __NR_sched_get_priority_max:
    case __NR_sched_getscheduler:
    case __NR_sched_setscheduler:
    case __NR_sched_getparam:
    case __NR_sched_setparam:
#ifdef DESKTOP
    case __NR_sched_getaffinity:
#endif
      return Allow();

#ifdef DESKTOP
    case __NR_pipe2:
      return Allow();

    CASES_FOR_getrlimit:
    case __NR_clock_getres:
    CASES_FOR_getresuid:
    CASES_FOR_getresgid:
      return Allow();

    case __NR_prlimit64: {
      // Allow only the getrlimit() use case.  (glibc seems to use
      // only pid 0 to indicate the current process; pid == getpid()
      // is equivalent and could also be allowed if needed.)
      Arg<pid_t> pid(0);
      // This is really a const struct ::rlimit*, but Arg<> doesn't
      // work with pointers, only integer types.
      Arg<uintptr_t> new_limit(2);
      return If(AllOf(pid == 0, new_limit == 0), Allow())
        .Else(InvalidSyscall());
    }

      // PulseAudio calls umask, even though it's unsafe in
      // multithreaded applications.  But, allowing it here doesn't
      // really do anything one way or the other, now that file
      // accesses are brokered to another process.
    case __NR_umask:
      return Allow();

    case __NR_kill: {
      Arg<int> sig(1);
      // PulseAudio uses kill(pid, 0) to check if purported owners of
      // shared memory files are still alive; see bug 1397753 for more
      // details.
      return If(sig == 0, Error(EPERM))
        .Else(InvalidSyscall());
    }

    case __NR_wait4:
#ifdef __NR_waitpid
    case __NR_waitpid:
#endif
      // NSPR will start a thread to wait for child processes even if
      // fork() fails; see bug 227246 and bug 1299581.
      return Error(ECHILD);

      // inotify_{add,rm}_watch take filesystem paths.  Pretend the
      // kernel doesn't support inotify; note that this could make
      // libgio attempt network connections for FAM.
    case __NR_inotify_init:
    case __NR_inotify_init1:
      return Error(ENOSYS);

    case __NR_eventfd2:
      return Allow();

#ifdef __NR_memfd_create
    case __NR_memfd_create:
      return Allow();
#endif

#ifdef __NR_rt_tgsigqueueinfo
      // Only allow to send signals within the process.
    case __NR_rt_tgsigqueueinfo: {
      Arg<pid_t> tgid(0);
      return If(tgid == getpid(), Allow())
        .Else(InvalidSyscall());
    }
#endif

    case __NR_mlock:
    case __NR_munlock:
      return Allow();

      // We can't usefully allow fork+exec, even on a temporary basis;
      // the child would inherit the seccomp-bpf policy and almost
      // certainly die from an unexpected SIGSYS.  We also can't have
      // fork() crash, currently, because there are too many system
      // libraries/plugins that try to run commands.  But they can
      // usually do something reasonable on error.
    case __NR_clone:
      return ClonePolicy(Error(EPERM));

#ifdef __NR_fadvise64
    case __NR_fadvise64:
      return Allow();
#endif

#ifdef __NR_fadvise64_64
    case __NR_fadvise64_64:
      return Allow();
#endif

    case __NR_fallocate:
      return Allow();

    case __NR_get_mempolicy:
      return Allow();

#endif // DESKTOP

#ifdef __NR_getrandom
    case __NR_getrandom:
      return Allow();
#endif

      // nsSystemInfo uses uname (and we cache an instance, so
      // the info remains present even if we block the syscall)
    case __NR_uname:
#ifdef DESKTOP
    case __NR_sysinfo:
#endif
      return Allow();

#ifdef MOZ_JPROF
    case __NR_setitimer:
      return Allow();
#endif // MOZ_JPROF

    default:
      return SandboxPolicyCommon::EvaluateSyscall(sysno);
    }
  }
};

UniquePtr<sandbox::bpf_dsl::Policy>
GetContentSandboxPolicy(SandboxBrokerClient* aMaybeBroker,
                        const std::vector<int>& aSyscallWhitelist)
{
  return MakeUnique<ContentSandboxPolicy>(aMaybeBroker, aSyscallWhitelist);
}
#endif // MOZ_CONTENT_SANDBOX


#ifdef MOZ_GMP_SANDBOX
// Unlike for content, the GeckoMediaPlugin seccomp-bpf policy needs
// to be an effective sandbox by itself, because we allow GMP on Linux
// systems where that's the only sandboxing mechanism we can use.
//
// Be especially careful about what this policy allows.
class GMPSandboxPolicy : public SandboxPolicyCommon {
  static intptr_t OpenTrap(const sandbox::arch_seccomp_data& aArgs,
                           void* aux)
  {
    const auto files = static_cast<const SandboxOpenedFiles*>(aux);
    const char* path;
    int flags;

    switch (aArgs.nr) {
#ifdef __NR_open
    case __NR_open:
      path = reinterpret_cast<const char*>(aArgs.args[0]);
      flags = static_cast<int>(aArgs.args[1]);
      break;
#endif
    case __NR_openat:
      // The path has to be absolute to match the pre-opened file (see
      // assertion in ctor) so the dirfd argument is ignored.
      path = reinterpret_cast<const char*>(aArgs.args[1]);
      flags = static_cast<int>(aArgs.args[2]);
      break;
    default:
      MOZ_CRASH("unexpected syscall number");
    }

    if ((flags & O_ACCMODE) != O_RDONLY) {
      SANDBOX_LOG_ERROR("non-read-only open of file %s attempted (flags=0%o)",
                        path, flags);
      return -EROFS;
    }
    int fd = files->GetDesc(path);
    if (fd < 0) {
      // SandboxOpenedFile::GetDesc already logged about this, if appropriate.
      return -ENOENT;
    }
    return fd;
  }

  static intptr_t SchedTrap(const sandbox::arch_seccomp_data& aArgs,
                            void* aux)
  {
    const pid_t tid = syscall(__NR_gettid);
    if (aArgs.args[0] == static_cast<uint64_t>(tid)) {
      return DoSyscall(aArgs.nr,
                       0,
                       aArgs.args[1],
                       aArgs.args[2],
                       aArgs.args[3],
                       aArgs.args[4],
                       aArgs.args[5]);
    }
    SANDBOX_LOG_ERROR("unsupported tid in SchedTrap");
    return BlockedSyscallTrap(aArgs, nullptr);
  }

  static intptr_t UnameTrap(const sandbox::arch_seccomp_data& aArgs,
                            void* aux)
  {
    const auto buf = reinterpret_cast<struct utsname*>(aArgs.args[0]);
    PodZero(buf);
    // The real uname() increases fingerprinting risk for no benefit.
    // This is close enough.
    strcpy(buf->sysname, "Linux");
    strcpy(buf->version, "3");
    return 0;
  };

  static intptr_t FcntlTrap(const sandbox::arch_seccomp_data& aArgs,
                            void* aux)
  {
    const auto cmd = static_cast<int>(aArgs.args[1]);
    switch (cmd) {
      // This process can't exec, so the actual close-on-exec flag
      // doesn't matter; have it always read as true and ignore writes.
    case F_GETFD:
      return O_CLOEXEC;
    case F_SETFD:
      return 0;
    default:
      return -ENOSYS;
    }
  }

  const SandboxOpenedFiles* mFiles;
public:
  explicit GMPSandboxPolicy(const SandboxOpenedFiles* aFiles)
  : mFiles(aFiles)
  { }

  ~GMPSandboxPolicy() override = default;

  ResultExpr EvaluateSyscall(int sysno) const override {
    switch (sysno) {
      // Simulate opening the plugin file.
#ifdef __NR_open
    case __NR_open:
#endif
    case __NR_openat:
      return Trap(OpenTrap, mFiles);

      // ipc::Shmem
    case __NR_mprotect:
      return Allow();
    case __NR_madvise: {
      Arg<int> advice(2);
      return If(advice == MADV_DONTNEED, Allow())
        .ElseIf(advice == MADV_FREE, Allow())
        .ElseIf(advice == MADV_HUGEPAGE, Allow())
        .ElseIf(advice == MADV_NOHUGEPAGE, Allow())
#ifdef MOZ_ASAN
        .ElseIf(advice == MADV_DONTDUMP, Allow())
#endif
        .Else(InvalidSyscall());
    }
    case __NR_brk:
    CASES_FOR_geteuid:
      return Allow();
    case __NR_sched_getparam:
    case __NR_sched_getscheduler:
    case __NR_sched_get_priority_min:
    case __NR_sched_get_priority_max:
    case __NR_sched_setscheduler: {
      Arg<pid_t> pid(0);
      return If(pid == 0, Allow())
        .Else(Trap(SchedTrap, nullptr));
    }

    // For clock(3) on older glibcs; bug 1304220.
    case __NR_times:
      return Allow();

    // Bug 1372428
    case __NR_uname:
      return Trap(UnameTrap, nullptr);
    CASES_FOR_fcntl:
      return Trap(FcntlTrap, nullptr);

    case __NR_dup:
      return Allow();

    default:
      return SandboxPolicyCommon::EvaluateSyscall(sysno);
    }
  }
};

UniquePtr<sandbox::bpf_dsl::Policy>
GetMediaSandboxPolicy(const SandboxOpenedFiles* aFiles)
{
  return UniquePtr<sandbox::bpf_dsl::Policy>(new GMPSandboxPolicy(aFiles));
}

#endif // MOZ_GMP_SANDBOX

} // namespace mozilla