Bug 1543388 - Update vendored memchr 2.x crate. r=glandium
author: Henri Sivonen <hsivonen@hsivonen.fi>
date: Mon, 15 Apr 2019 23:44:39 +0000
changeset: 469627 dc16dbae4805
parent: 469626 0e9aea59c0b5
child: 469628 eb8d178cd72a
push id: 35878
push user: apavel@mozilla.com
push date: Tue, 16 Apr 2019 15:43:40 +0000
treeherder: mozilla-central@258af4e91151
reviewers: glandium
bugs: 1543388
milestone: 68.0a1
Bug 1543388 - Update vendored memchr 2.x crate. r=glandium

Differential Revision: https://phabricator.services.mozilla.com/D27485
Cargo.lock
third_party/rust/memchr/.cargo-checksum.json
third_party/rust/memchr/Cargo.toml
third_party/rust/memchr/Makefile
third_party/rust/memchr/README.md
third_party/rust/memchr/appveyor.yml
third_party/rust/memchr/benches/bench.rs
third_party/rust/memchr/build.rs
third_party/rust/memchr/ctags.rust
third_party/rust/memchr/session.vim
third_party/rust/memchr/src/c.rs
third_party/rust/memchr/src/fallback.rs
third_party/rust/memchr/src/iter.rs
third_party/rust/memchr/src/lib.rs
third_party/rust/memchr/src/naive.rs
third_party/rust/memchr/src/tests/iter.rs
third_party/rust/memchr/src/tests/memchr.rs
third_party/rust/memchr/src/tests/mod.rs
third_party/rust/memchr/src/x86/avx.rs
third_party/rust/memchr/src/x86/mod.rs
third_party/rust/memchr/src/x86/sse2.rs
third_party/rust/memchr/src/x86/sse42.rs
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -16,17 +16,17 @@ name = "adler32"
 version = "1.0.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 
 [[package]]
 name = "aho-corasick"
 version = "0.6.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 dependencies = [
- "memchr 2.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
+ "memchr 2.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
 ]
 
 [[package]]
 name = "ansi_term"
 version = "0.11.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 dependencies = [
  "winapi 0.3.6 (git+https://github.com/froydnj/winapi-rs?branch=aarch64)",
@@ -1658,21 +1658,18 @@ name = "memchr"
 version = "1.0.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 dependencies = [
  "libc 0.2.51 (registry+https://github.com/rust-lang/crates.io-index)",
 ]
 
 [[package]]
 name = "memchr"
-version = "2.0.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-dependencies = [
- "libc 0.2.51 (registry+https://github.com/rust-lang/crates.io-index)",
-]
+version = "2.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
 
 [[package]]
 name = "memmap"
 version = "0.5.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 dependencies = [
  "fs2 0.4.3 (registry+https://github.com/rust-lang/crates.io-index)",
  "kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)",
@@ -1919,17 +1916,17 @@ dependencies = [
  "memchr 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)",
 ]
 
 [[package]]
 name = "nom"
 version = "4.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 dependencies = [
- "memchr 2.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
+ "memchr 2.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
 ]
 
 [[package]]
 name = "nserror"
 version = "0.1.0"
 dependencies = [
  "nsstring 0.1.0",
 ]
@@ -2298,17 +2295,17 @@ dependencies = [
 ]
 
 [[package]]
 name = "regex"
 version = "1.0.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 dependencies = [
  "aho-corasick 0.6.8 (registry+https://github.com/rust-lang/crates.io-index)",
- "memchr 2.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
+ "memchr 2.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
  "regex-syntax 0.6.0 (registry+https://github.com/rust-lang/crates.io-index)",
  "thread_local 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
  "utf8-ranges 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
 ]
 
 [[package]]
 name = "regex-syntax"
 version = "0.4.1"
@@ -3597,17 +3594,17 @@ dependencies = [
 "checksum lmdb-rkv 0.11.2 (registry+https://github.com/rust-lang/crates.io-index)" = "1452294309db7977dc75e1e8135a8c654d9e52e04ff0c0bd06c880897a91defd"
 "checksum lmdb-rkv-sys 0.8.2 (registry+https://github.com/rust-lang/crates.io-index)" = "96846a2e6785ec0fce6577479d18273c8e5b287e6df8a1b398b7f0f7a41cdcbb"
 "checksum lock_api 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "62ebf1391f6acad60e5c8b43706dde4582df75c06698ab44511d15016bc2442c"
 "checksum log 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)" = "e19e8d5c34a3e0e2223db8e060f9e8264aeeb5c5fc64a4ee9965c062211c024b"
 "checksum log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)" = "c84ec4b527950aa83a329754b01dbe3f58361d1c5efacd1f6d68c494d08a17c6"
 "checksum malloc_size_of_derive 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "35adee9ed962cf7d07d62cb58bc45029f3227f5b5b86246caa8632f06c187bc3"
 "checksum matches 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "100aabe6b8ff4e4a7e32c1c13523379802df0772b82466207ac25b013f193376"
 "checksum memchr 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "148fab2e51b4f1cfc66da2a7c32981d1d3c083a803978268bb11fe4b86925e7a"
-"checksum memchr 2.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "796fba70e76612589ed2ce7f45282f5af869e0fdd7cc6199fa1aa1f1d591ba9d"
+"checksum memchr 2.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "2efc7bc57c883d4a4d6e3246905283d8dae951bb3bd32f49d6ef297f546e1c39"
 "checksum memmap 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)" = "46f3c7359028b31999287dae4e5047ddfe90a23b7dca2282ce759b491080c99b"
 "checksum memmap 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)" = "e2ffa2c986de11a9df78620c01eeaaf27d94d3ff02bf81bfcca953102dd0c6ff"
 "checksum memoffset 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "0f9dc261e2b62d7a622bf416ea3c5245cdd5d9a7fcc428c0d06804dfce1775b3"
 "checksum miniz_oxide 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "aaa2d3ad070f428fffbd7d3ca2ea20bb0d8cffe9024405c44e1840bc1418b398"
 "checksum miniz_oxide_c_api 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "92d98fdbd6145645828069b37ea92ca3de225e000d80702da25c20d3584b38a5"
 "checksum mio 0.6.15 (registry+https://github.com/rust-lang/crates.io-index)" = "4fcfcb32d63961fb6f367bfd5d21e4600b92cd310f71f9dca25acae196eb1560"
 "checksum mio-named-pipes 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "82f43a815b57d2d652550f3d20cec88a495bb2d0956aa873dc43040278455677"
 "checksum mio-uds 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)" = "1731a873077147b626d89cc6c2a0db6288d607496c5d10c0cfcf3adc697ec673"
--- a/third_party/rust/memchr/.cargo-checksum.json
+++ b/third_party/rust/memchr/.cargo-checksum.json
@@ -1,1 +1,1 @@
-{"files":{"COPYING":"01c266bced4a434da0051174d6bee16a4c82cf634e2679b6155d40d75012390f","Cargo.toml":"b37fbdbb466bbb057f4d715381c45ab26e37cdc469b97d901abdcb4b44733fc1","LICENSE-MIT":"0f96a83840e146e43c0ec96a22ec1f392e0680e6c1226e6f3ba87e0740af850f","Makefile":"a45a128685a2ae7d4fa39d310786674417ee113055ef290a11f88002285865fc","README.md":"74e385c51a2402527a61a500d66e509fea97961f15bfffab85040064e576fe31","UNLICENSE":"7e12e5df4bae12cb21581ba157ced20e1986a0508dd10d0e8a4ab9a4cf94e85c","appveyor.yml":"b5c1a28f805854370f24e530df912764a9520f4581b33da090f44cec0eef181c","benches/bench.rs":"87cfb76154c3c322691201c6f5649b37665ed8bf1cf303bca971309a4eef6b61","ctags.rust":"3d128d3cc59f702e68953ba2fe6c3f46bc6991fc575308db060482d5da0c79f3","session.vim":"95cb1d7caf0ff7fbe76ec911988d908ddd883381c925ba64b537695bc9f021c4","src/lib.rs":"bd483dd7732610710f592861a77c733a321600267cf0a8237b5ac1b05d5e3c20"},"package":"796fba70e76612589ed2ce7f45282f5af869e0fdd7cc6199fa1aa1f1d591ba9d"}
\ No newline at end of file
+{"files":{"COPYING":"01c266bced4a434da0051174d6bee16a4c82cf634e2679b6155d40d75012390f","Cargo.toml":"47408824ff8a0861c0df0cb499f687f7463e461c3b81ea20edf1662b7dc5a121","LICENSE-MIT":"0f96a83840e146e43c0ec96a22ec1f392e0680e6c1226e6f3ba87e0740af850f","README.md":"ea4632001f2b384c67278e7114a843ab7094d6011ef87a793881e73cfb525120","UNLICENSE":"7e12e5df4bae12cb21581ba157ced20e1986a0508dd10d0e8a4ab9a4cf94e85c","build.rs":"a8483a0649fa418db667ccd2a16a60e57d886964b54c1af12923b6f6cb4f2c92","src/c.rs":"86fe35cbb46c8bece9927fbde20f1ca3af526defdde05ac969ad2f4bc9bb25e9","src/fallback.rs":"a79752e3bdc3c16febef90fcddb560f80659f802fac202cce3fdffd0b78f6d08","src/iter.rs":"5949fd42b266d3edebf133172c74700d1c0249bdd26c203bcfd1409583e7b502","src/lib.rs":"1b3c131d6ec66837d3d76ad42aa5fc4aebfae232587787bdedb5be2352be4502","src/naive.rs":"d908e5895586ef88913ee10ff3135aabd20363e0e0871c7006a5d40457851deb","src/tests/iter.rs":"262c09e5cabd1caef533475832da8716638b142ec8015e8270c6f5240e478ac1","src/tests/memchr.rs":"f30074eeab99a16ce5ca8a30f1890f86c43c0422523a7195cbb3ca5f3e465b67","src/tests/mod.rs":"8ad1d065d422877ee043f66f987a736a0216757ba66dc1a05bee2a7c949c8037","src/x86/avx.rs":"11d4a149007fde9f34168fa43b3cba700667782c987796dca8526564b3e01007","src/x86/mod.rs":"5032eec7355bc110ec99b06b2cfd916d8bc1e28c44a33f6c92c3cc86f797d683","src/x86/sse2.rs":"103bb9d555be789e678f1af5baa737e13911f60e90888d25fa2883b39a9dffce","src/x86/sse42.rs":"f671ae9dd2b518a823e499a09ce32d4957bc5ae043db90d61c027e32f688f2b2"},"package":"2efc7bc57c883d4a4d6e3246905283d8dae951bb3bd32f49d6ef297f546e1c39"}
\ No newline at end of file
--- a/third_party/rust/memchr/Cargo.toml
+++ b/third_party/rust/memchr/Cargo.toml
@@ -7,18 +7,19 @@
 #
 # If you believe there's an error in this file please file an
 # issue against the rust-lang/cargo repository. If you're
 # editing this file be aware that the upstream Cargo.toml
 # will likely look very different (and much more reasonable)
 
 [package]
 name = "memchr"
-version = "2.0.1"
+version = "2.2.0"
 authors = ["Andrew Gallant <jamslam@gmail.com>", "bluss"]
+exclude = ["/ci/*", "/.travis.yml", "/Makefile", "/appveyor.yml"]
 description = "Safe interface to memchr."
 homepage = "https://github.com/BurntSushi/rust-memchr"
 documentation = "https://docs.rs/memchr/"
 readme = "README.md"
 keywords = ["memchr", "char", "scan", "strchr", "string"]
 license = "Unlicense/MIT"
 repository = "https://github.com/BurntSushi/rust-memchr"
 [profile.test]
@@ -27,19 +28,19 @@ opt-level = 3
 [lib]
 name = "memchr"
 bench = false
 [dependencies.libc]
 version = "0.2.18"
 optional = true
 default-features = false
 [dev-dependencies.quickcheck]
-version = "0.5"
+version = "0.8"
 default-features = false
 
 [features]
-default = ["use_std", "libc"]
-use_std = ["libc", "libc/use_std"]
+default = ["use_std"]
+use_std = []
 [badges.appveyor]
 repository = "BurntSushi/rust-memchr"
 
 [badges.travis-ci]
 repository = "BurntSushi/rust-memchr"
deleted file mode 100644
--- a/third_party/rust/memchr/Makefile
+++ /dev/null
@@ -1,14 +0,0 @@
-all:
-	echo Nothing to do...
-
-ctags:
-	ctags --recurse --options=ctags.rust --languages=Rust
-
-docs:
-	cargo doc
-	in-dir ./target/doc fix-perms
-	rscp ./target/doc/* gopher:~/www/burntsushi.net/rustdoc/
-
-push:
-	git push origin master
-	git push github master
--- a/third_party/rust/memchr/README.md
+++ b/third_party/rust/memchr/README.md
@@ -1,36 +1,50 @@
-This crate provides a safe interface `libc`'s `memchr` and `memrchr`.
-This crate also provides fallback implementations when either function is
-unavailable.
+memchr
+======
+The `memchr` crate provides heavily optimized routines for searching bytes.
 
 [![Build status](https://api.travis-ci.org/BurntSushi/rust-memchr.png)](https://travis-ci.org/BurntSushi/rust-memchr)
 [![Build status](https://ci.appveyor.com/api/projects/status/8i9484t8l4w7uql0/branch/master?svg=true)](https://ci.appveyor.com/project/BurntSushi/rust-memchr/branch/master)
 [![](http://meritbadge.herokuapp.com/memchr)](https://crates.io/crates/memchr)
 
 Dual-licensed under MIT or the [UNLICENSE](http://unlicense.org).
 
 
 ### Documentation
 
 [https://docs.rs/memchr](https://docs.rs/memchr)
 
-### no_std
+
+### Overview
+
+The `memchr` function is traditionally provided by libc, however, the
+performance of `memchr` can vary significantly depending on the specific
+implementation of libc that is used. They can range from manually tuned
+Assembly implementations (like that found in GNU's libc) all the way to
+non-vectorized C implementations (like that found in MUSL).
+
+To smooth out the differences between implementations of libc, at least
+on `x86_64` for Rust 1.27+, this crate provides its own implementation of
+`memchr` that should perform competitively with the one found in GNU's libc.
+The implementation is in pure Rust and has no dependency on a C compiler or an
+Assembler.
+
+Additionally, GNU libc also provides an extension, `memrchr`. This crate
+provides its own implementation of `memrchr` as well, on top of `memchr2`,
+`memchr3`, `memrchr2` and `memrchr3`. The difference between `memchr` and
+`memchr2` is that `memchr2` permits finding all occurrences of two bytes
+instead of one. Similarly for `memchr3`.
+
+### Compiling without the standard library
 
 memchr links to the standard library by default, but you can disable the
 `use_std` feature if you want to use it in a `#![no_std]` crate:
 
 ```toml
 [dependencies]
-memchr = { version = "1.0", default-features = false }
+memchr = { version = "2", default-features = false }
 ```
 
-### Performance
-
-On my system (Linux/amd64), `memchr` is about an order of magnitude faster than
-the more idiomatic `haystack.iter().position(|&b| b == needle)`:
-
-```
-test iterator          ... bench:       5,280 ns/iter (+/- 13) = 1893 MB/s
-test iterator_reversed ... bench:       5,271 ns/iter (+/- 7) = 1897 MB/s
-test libc_memchr       ... bench:         202 ns/iter (+/- 0) = 49504 MB/s
-test libc_memrchr      ... bench:         197 ns/iter (+/- 1) = 50761 MB/s
-```
+On x86 platforms, when the `use_std` feature is disabled, the SSE2
+implementation of memchr will be used in compilers that support it. When
+`use_std` is enabled, the AVX implementation of memchr will be used if the CPU
+is determined to support it at runtime.
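
To make the new README text concrete, here is a small usage sketch (an editorial illustration, not part of the vendored patch; the haystack and needle bytes are arbitrary) of the routines it describes:

```rust
use memchr::{memchr, memchr2, memchr3, memrchr};

fn main() {
    let haystack = b"the quick brown fox";

    // First occurrence of a single byte.
    assert_eq!(memchr(b'o', haystack), Some(12));
    // Last occurrence of the same byte (the GNU-extension analogue).
    assert_eq!(memrchr(b'o', haystack), Some(17));
    // First occurrence of either of two bytes.
    assert_eq!(memchr2(b'q', b'f', haystack), Some(4));
    // First occurrence of any of three bytes.
    assert_eq!(memchr3(b'z', b'x', b'f', haystack), Some(16));
}
```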
deleted file mode 100644
--- a/third_party/rust/memchr/appveyor.yml
+++ /dev/null
@@ -1,19 +0,0 @@
-environment:
-  matrix:
-  - TARGET: x86_64-pc-windows-msvc
-  - TARGET: i686-pc-windows-msvc
-  - TARGET: i686-pc-windows-gnu
-install:
-  - ps: Start-FileDownload "https://static.rust-lang.org/dist/rust-nightly-${env:TARGET}.exe"
-  - rust-nightly-%TARGET%.exe /VERYSILENT /NORESTART /DIR="C:\Program Files (x86)\Rust"
-  - SET PATH=%PATH%;C:\Program Files (x86)\Rust\bin
-  - SET PATH=%PATH%;C:\MinGW\bin
-  - rustc -V
-  - cargo -V
-
-build: false
-
-test_script:
-  - cargo build --verbose
-  - cargo test --verbose
-  - cargo bench --verbose
deleted file mode 100644
--- a/third_party/rust/memchr/benches/bench.rs
+++ /dev/null
@@ -1,117 +0,0 @@
-#![feature(test)]
-
-extern crate memchr;
-extern crate test;
-
-use std::iter;
-
-fn bench_data() -> Vec<u8> { iter::repeat(b'z').take(10000).collect() }
-
-#[bench]
-fn iterator_memchr(b: &mut test::Bencher) {
-    let haystack = bench_data();
-    let needle = b'a';
-    b.iter(|| {
-        assert!(haystack.iter().position(|&b| b == needle).is_none());
-    });
-    b.bytes = haystack.len() as u64;
-}
-
-#[bench]
-fn optimized_memchr(b: &mut test::Bencher) {
-    let haystack = bench_data();
-    let needle = b'a';
-    b.iter(|| {
-        assert!(memchr::memchr(needle, &haystack).is_none());
-    });
-    b.bytes = haystack.len() as u64;
-}
-
-#[bench]
-fn iterator_memrchr(b: &mut test::Bencher) {
-    let haystack = bench_data();
-    let needle = b'a';
-    b.iter(|| {
-        assert!(haystack.iter().rposition(|&b| b == needle).is_none());
-    });
-    b.bytes = haystack.len() as u64;
-}
-
-#[bench]
-fn optimized_memrchr(b: &mut test::Bencher) {
-    let haystack = bench_data();
-    let needle = b'a';
-    b.iter(|| {
-        assert!(memchr::memrchr(needle, &haystack).is_none());
-    });
-    b.bytes = haystack.len() as u64;
-}
-
-#[bench]
-fn iterator_memchr2(b: &mut test::Bencher) {
-    let haystack = bench_data();
-    let (needle1, needle2) = (b'a', b'b');
-    b.iter(|| {
-        assert!(haystack.iter().position(|&b| {
-            b == needle1 || b == needle2
-        }).is_none());
-    });
-    b.bytes = haystack.len() as u64;
-}
-
-#[bench]
-fn manual_memchr2(b: &mut test::Bencher) {
-    fn find_singles(
-        sparse: &[bool],
-        text: &[u8],
-    ) -> Option<(usize, usize)> {
-        for (hi, &b) in text.iter().enumerate() {
-            if sparse[b as usize] {
-                return Some((hi, hi+1));
-            }
-        }
-        None
-    }
-
-    let haystack = bench_data();
-    let mut sparse = vec![false; 256];
-    sparse[b'a' as usize] = true;
-    sparse[b'b' as usize] = true;
-    b.iter(|| {
-        assert!(find_singles(&sparse, &haystack).is_none());
-    });
-    b.bytes = haystack.len() as u64;
-}
-
-#[bench]
-fn optimized_memchr2(b: &mut test::Bencher) {
-    let haystack = bench_data();
-    let (needle1, needle2) = (b'a', b'b');
-    b.iter(|| {
-        assert!(memchr::memchr2(needle1, needle2, &haystack).is_none());
-    });
-    b.bytes = haystack.len() as u64;
-}
-
-#[bench]
-fn iterator_memchr3(b: &mut test::Bencher) {
-    let haystack = bench_data();
-    let (needle1, needle2, needle3) = (b'a', b'b', b'c');
-    b.iter(|| {
-        assert!(haystack.iter().position(|&b| {
-            b == needle1 || b == needle2 || b == needle3
-        }).is_none());
-    });
-    b.bytes = haystack.len() as u64;
-}
-
-#[bench]
-fn optimized_memchr3(b: &mut test::Bencher) {
-    let haystack = bench_data();
-    let (needle1, needle2, needle3) = (b'a', b'b', b'c');
-    b.iter(|| {
-        assert!(memchr::memchr3(
-            needle1, needle2, needle3, &haystack).is_none());
-    });
-    b.bytes = haystack.len() as u64;
-}
new file mode 100644
--- /dev/null
+++ b/third_party/rust/memchr/build.rs
@@ -0,0 +1,134 @@
+use std::env;
+use std::ffi::OsString;
+use std::process::Command;
+
+fn main() {
+    let version = match Version::read() {
+        Ok(version) => version,
+        Err(err) => {
+            eprintln!("failed to parse `rustc --version`: {}", err);
+            return;
+        }
+    };
+    enable_simd_optimizations(version);
+    enable_libc();
+}
+
+// This adds various simd cfgs if this compiler supports it.
+//
+// This can be disabled with RUSTFLAGS="--cfg memchr_disable_auto_simd", but
+// this is generally only intended for testing.
+fn enable_simd_optimizations(version: Version) {
+    if is_env_set("CARGO_CFG_MEMCHR_DISABLE_AUTO_SIMD") {
+        return;
+    }
+    if version < (Version { major: 1, minor: 27, patch: 0 }) {
+        return;
+    }
+
+    println!("cargo:rustc-cfg=memchr_runtime_simd");
+    println!("cargo:rustc-cfg=memchr_runtime_sse2");
+    println!("cargo:rustc-cfg=memchr_runtime_sse42");
+    println!("cargo:rustc-cfg=memchr_runtime_avx");
+}
+
+// This adds a `memchr_libc` cfg if and only if libc can be used, if no other
+// better option is available.
+//
+// This could be performed in the source code, but it's simpler to do it once
+// here and consolidate it into one cfg knob.
+//
+// Basically, we use libc only if its enabled and if we aren't targeting a
+// known bad platform. For example, wasm32 doesn't have a libc and the
+// performance of memchr on Windows is seemingly worse than the fallback
+// implementation.
+fn enable_libc() {
+    const NO_ARCH: &'static [&'static str] = &["wasm32", "windows"];
+    const NO_ENV: &'static [&'static str] = &["sgx"];
+
+    if !is_feature_set("LIBC") {
+        return;
+    }
+
+    let arch = match env::var("CARGO_CFG_TARGET_ARCH") {
+        Err(_) => return,
+        Ok(arch) => arch,
+    };
+    let env = match env::var("CARGO_CFG_TARGET_ENV") {
+        Err(_) => return,
+        Ok(env) => env,
+    };
+    if NO_ARCH.contains(&&*arch) || NO_ENV.contains(&&*env) {
+        return;
+    }
+
+    println!("cargo:rustc-cfg=memchr_libc");
+}
+
+fn is_feature_set(name: &str) -> bool {
+    is_env_set(&format!("CARGO_FEATURE_{}",  name))
+}
+
+fn is_env_set(name: &str) -> bool {
+    env::var_os(name).is_some()
+}
+
+#[derive(Clone, Copy, Debug, Eq, PartialEq, PartialOrd, Ord)]
+struct Version {
+    major: u32,
+    minor: u32,
+    patch: u32,
+}
+
+impl Version {
+    fn read() -> Result<Version, String> {
+        let rustc = env::var_os("RUSTC").unwrap_or(OsString::from("rustc"));
+        let output = Command::new(&rustc)
+            .arg("--version")
+            .output()
+            .unwrap()
+            .stdout;
+        Version::parse(&String::from_utf8(output).unwrap())
+    }
+
+    fn parse(mut s: &str) -> Result<Version, String> {
+        if !s.starts_with("rustc ") {
+            return Err(format!("unrecognized version string: {}", s));
+        }
+        s = &s["rustc ".len()..];
+
+        let parts: Vec<&str> = s.split(".").collect();
+        if parts.len() < 3 {
+            return Err(format!("not enough version parts: {:?}", parts));
+        }
+
+        let mut num = String::new();
+        for c in parts[0].chars() {
+            if !c.is_digit(10) {
+                break;
+            }
+            num.push(c);
+        }
+        let major = num.parse::<u32>().map_err(|e| e.to_string())?;
+
+        num.clear();
+        for c in parts[1].chars() {
+            if !c.is_digit(10) {
+                break;
+            }
+            num.push(c);
+        }
+        let minor = num.parse::<u32>().map_err(|e| e.to_string())?;
+
+        num.clear();
+        for c in parts[2].chars() {
+            if !c.is_digit(10) {
+                break;
+            }
+            num.push(c);
+        }
+        let patch = num.parse::<u32>().map_err(|e| e.to_string())?;
+
+        Ok(Version { major, minor, patch })
+    }
+}
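
A detail the build script depends on: `#[derive(PartialOrd, Ord)]` on `Version` compares fields lexicographically in declaration order (major, then minor, then patch), which is what makes the `version < Version { major: 1, minor: 27, patch: 0 }` cutoff correct. A minimal standalone sketch of that behavior:

```rust
// Same field order as the Version struct in build.rs above.
#[derive(Clone, Copy, Debug, Eq, PartialEq, PartialOrd, Ord)]
struct Version {
    major: u32,
    minor: u32,
    patch: u32,
}

fn main() {
    let cutoff = Version { major: 1, minor: 27, patch: 0 };
    // Derived ordering compares major first, then minor, then patch.
    assert!(Version { major: 1, minor: 26, patch: 2 } < cutoff);
    assert!(Version { major: 1, minor: 27, patch: 0 } >= cutoff);
    assert!(Version { major: 2, minor: 0, patch: 0 } > cutoff);
}
```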
deleted file mode 100644
--- a/third_party/rust/memchr/ctags.rust
+++ /dev/null
@@ -1,11 +0,0 @@
---langdef=Rust
---langmap=Rust:.rs
---regex-Rust=/^[ \t]*(#\[[^\]]\][ \t]*)*(pub[ \t]+)?(extern[ \t]+)?("[^"]+"[ \t]+)?(unsafe[ \t]+)?fn[ \t]+([a-zA-Z0-9_]+)/\6/f,functions,function definitions/
---regex-Rust=/^[ \t]*(pub[ \t]+)?type[ \t]+([a-zA-Z0-9_]+)/\2/T,types,type definitions/
---regex-Rust=/^[ \t]*(pub[ \t]+)?enum[ \t]+([a-zA-Z0-9_]+)/\2/g,enum,enumeration names/
---regex-Rust=/^[ \t]*(pub[ \t]+)?struct[ \t]+([a-zA-Z0-9_]+)/\2/s,structure names/
---regex-Rust=/^[ \t]*(pub[ \t]+)?mod[ \t]+([a-zA-Z0-9_]+)/\2/m,modules,module names/
---regex-Rust=/^[ \t]*(pub[ \t]+)?static[ \t]+([a-zA-Z0-9_]+)/\2/c,consts,static constants/
---regex-Rust=/^[ \t]*(pub[ \t]+)?trait[ \t]+([a-zA-Z0-9_]+)/\2/t,traits,traits/
---regex-Rust=/^[ \t]*(pub[ \t]+)?impl([ \t\n]+<.*>)?[ \t]+([a-zA-Z0-9_]+)/\3/i,impls,trait implementations/
---regex-Rust=/^[ \t]*macro_rules![ \t]+([a-zA-Z0-9_]+)/\1/d,macros,macro definitions/
deleted file mode 100644
--- a/third_party/rust/memchr/session.vim
+++ /dev/null
@@ -1,1 +0,0 @@
-au BufWritePost *.rs silent!make ctags > /dev/null 2>&1
new file mode 100644
--- /dev/null
+++ b/third_party/rust/memchr/src/c.rs
@@ -0,0 +1,44 @@
+// This module defines safe wrappers around memchr (POSIX) and memrchr (GNU
+// extension).
+
+#![allow(dead_code)]
+
+extern crate libc;
+
+use self::libc::{c_int, c_void, size_t};
+
+pub fn memchr(needle: u8, haystack: &[u8]) -> Option<usize> {
+    let p = unsafe {
+        libc::memchr(
+            haystack.as_ptr() as *const c_void,
+            needle as c_int,
+            haystack.len() as size_t,
+        )
+    };
+    if p.is_null() {
+        None
+    } else {
+        Some(p as usize - (haystack.as_ptr() as usize))
+    }
+}
+
+// memrchr is a GNU extension. We know it's available on Linux, so start there.
+#[cfg(target_os = "linux")]
+pub fn memrchr(needle: u8, haystack: &[u8]) -> Option<usize> {
+    // GNU's memrchr() will - unlike memchr() - error if haystack is empty.
+    if haystack.is_empty() {
+        return None;
+    }
+    let p = unsafe {
+        libc::memrchr(
+            haystack.as_ptr() as *const c_void,
+            needle as c_int,
+            haystack.len() as size_t,
+        )
+    };
+    if p.is_null() {
+        None
+    } else {
+        Some(p as usize - (haystack.as_ptr() as usize))
+    }
+}
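
The wrappers above convert libc's returned pointer into an index by subtracting the haystack's base address. A hedged equivalence check (an illustration using only the crate's public API, with arbitrary example input), since whichever backend is selected must agree with the naive scan:

```rust
use memchr::memchr;

fn main() {
    let haystack = b"abcXdefXghi";
    // The optimized search and the naive iterator scan must agree.
    assert_eq!(
        memchr(b'X', haystack),
        haystack.iter().position(|&b| b == b'X'),
    );
    // Absent needles yield None from both.
    assert_eq!(memchr(b'Q', haystack), None);
}
```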
new file mode 100644
--- /dev/null
+++ b/third_party/rust/memchr/src/fallback.rs
@@ -0,0 +1,346 @@
+// This module defines pure Rust platform independent implementations of all
+// the memchr routines. We do our best to make them fast. Some of them may even
+// get auto-vectorized.
+
+use core::cmp;
+use core::ptr;
+use core::usize;
+
+#[cfg(target_pointer_width = "32")]
+const USIZE_BYTES: usize = 4;
+
+#[cfg(target_pointer_width = "64")]
+const USIZE_BYTES: usize = 8;
+
+// The number of bytes to loop at in one iteration of memchr/memrchr.
+const LOOP_SIZE: usize = 2 * USIZE_BYTES;
+
+/// Return `true` if `x` contains any zero byte.
+///
+/// From *Matters Computational*, J. Arndt
+///
+/// "The idea is to subtract one from each of the bytes and then look for
+/// bytes where the borrow propagated all the way to the most significant
+/// bit."
+#[inline(always)]
+fn contains_zero_byte(x: usize) -> bool {
+    const LO_U64: u64 = 0x0101010101010101;
+    const HI_U64: u64 = 0x8080808080808080;
+
+    const LO_USIZE: usize = LO_U64 as usize;
+    const HI_USIZE: usize = HI_U64 as usize;
+
+    x.wrapping_sub(LO_USIZE) & !x & HI_USIZE != 0
+}
+
+/// Repeat the given byte into a word size number. That is, every 8 bits
+/// is equivalent to the given byte. For example, if `b` is `\x4E` or
+/// `01001110` in binary, then the returned value on a 32-bit system would be:
+/// `01001110_01001110_01001110_01001110`.
+#[inline(always)]
+fn repeat_byte(b: u8) -> usize {
+    (b as usize) * (usize::MAX / 255)
+}
+
+pub fn memchr(n1: u8, haystack: &[u8]) -> Option<usize> {
+    let vn1 = repeat_byte(n1);
+    let confirm = |byte| byte == n1;
+    let loop_size = cmp::min(LOOP_SIZE, haystack.len());
+    let align = USIZE_BYTES - 1;
+    let start_ptr = haystack.as_ptr();
+    let end_ptr = haystack[haystack.len()..].as_ptr();
+    let mut ptr = start_ptr;
+
+    unsafe {
+        if haystack.len() < USIZE_BYTES {
+            return forward_search(start_ptr, end_ptr, ptr, confirm);
+        }
+
+        let chunk = read_unaligned_usize(ptr);
+        if contains_zero_byte(chunk ^ vn1) {
+            return forward_search(start_ptr, end_ptr, ptr, confirm);
+        }
+
+        ptr = ptr_add(ptr, USIZE_BYTES - (start_ptr as usize & align));
+        debug_assert!(ptr > start_ptr);
+        debug_assert!(ptr_sub(end_ptr, USIZE_BYTES) >= start_ptr);
+        while loop_size == LOOP_SIZE && ptr <= ptr_sub(end_ptr, loop_size) {
+            debug_assert_eq!(0, (ptr as usize) % USIZE_BYTES);
+
+            let a = *(ptr as *const usize);
+            let b = *(ptr_add(ptr, USIZE_BYTES) as *const usize);
+            let eqa = contains_zero_byte(a ^ vn1);
+            let eqb = contains_zero_byte(b ^ vn1);
+            if eqa || eqb {
+                break;
+            }
+            ptr = ptr_add(ptr, LOOP_SIZE);
+        }
+        forward_search(start_ptr, end_ptr, ptr, confirm)
+    }
+}
+
+/// Like `memchr`, but searches for two bytes instead of one.
+pub fn memchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
+    let vn1 = repeat_byte(n1);
+    let vn2 = repeat_byte(n2);
+    let confirm = |byte| byte == n1 || byte == n2;
+    let align = USIZE_BYTES - 1;
+    let start_ptr = haystack.as_ptr();
+    let end_ptr = haystack[haystack.len()..].as_ptr();
+    let mut ptr = start_ptr;
+
+    unsafe {
+        if haystack.len() < USIZE_BYTES {
+            return forward_search(start_ptr, end_ptr, ptr, confirm);
+        }
+
+        let chunk = read_unaligned_usize(ptr);
+        let eq1 = contains_zero_byte(chunk ^ vn1);
+        let eq2 = contains_zero_byte(chunk ^ vn2);
+        if eq1 || eq2 {
+            return forward_search(start_ptr, end_ptr, ptr, confirm);
+        }
+
+        ptr = ptr_add(ptr, USIZE_BYTES - (start_ptr as usize & align));
+        debug_assert!(ptr > start_ptr);
+        debug_assert!(ptr_sub(end_ptr, USIZE_BYTES) >= start_ptr);
+        while ptr <= ptr_sub(end_ptr, USIZE_BYTES) {
+            debug_assert_eq!(0, (ptr as usize) % USIZE_BYTES);
+
+            let chunk = *(ptr as *const usize);
+            let eq1 = contains_zero_byte(chunk ^ vn1);
+            let eq2 = contains_zero_byte(chunk ^ vn2);
+            if eq1 || eq2 {
+                break;
+            }
+            ptr = ptr_add(ptr, USIZE_BYTES);
+        }
+        forward_search(start_ptr, end_ptr, ptr, confirm)
+    }
+}
+
+/// Like `memchr`, but searches for three bytes instead of one.
+pub fn memchr3(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option<usize> {
+    let vn1 = repeat_byte(n1);
+    let vn2 = repeat_byte(n2);
+    let vn3 = repeat_byte(n3);
+    let confirm = |byte| byte == n1 || byte == n2 || byte == n3;
+    let align = USIZE_BYTES - 1;
+    let start_ptr = haystack.as_ptr();
+    let end_ptr = haystack[haystack.len()..].as_ptr();
+    let mut ptr = start_ptr;
+
+    unsafe {
+        if haystack.len() < USIZE_BYTES {
+            return forward_search(start_ptr, end_ptr, ptr, confirm);
+        }
+
+        let chunk = read_unaligned_usize(ptr);
+        let eq1 = contains_zero_byte(chunk ^ vn1);
+        let eq2 = contains_zero_byte(chunk ^ vn2);
+        let eq3 = contains_zero_byte(chunk ^ vn3);
+        if eq1 || eq2 || eq3 {
+            return forward_search(start_ptr, end_ptr, ptr, confirm);
+        }
+
+        ptr = ptr_add(ptr, USIZE_BYTES - (start_ptr as usize & align));
+        debug_assert!(ptr > start_ptr);
+        debug_assert!(ptr_sub(end_ptr, USIZE_BYTES) >= start_ptr);
+        while ptr <= ptr_sub(end_ptr, USIZE_BYTES) {
+            debug_assert_eq!(0, (ptr as usize) % USIZE_BYTES);
+
+            let chunk = *(ptr as *const usize);
+            let eq1 = contains_zero_byte(chunk ^ vn1);
+            let eq2 = contains_zero_byte(chunk ^ vn2);
+            let eq3 = contains_zero_byte(chunk ^ vn3);
+            if eq1 || eq2 || eq3 {
+                break;
+            }
+            ptr = ptr_add(ptr, USIZE_BYTES);
+        }
+        forward_search(start_ptr, end_ptr, ptr, confirm)
+    }
+}
+
+/// Return the last index matching the byte `x` in `text`.
+pub fn memrchr(n1: u8, haystack: &[u8]) -> Option<usize> {
+    let vn1 = repeat_byte(n1);
+    let confirm = |byte| byte == n1;
+    let loop_size = cmp::min(LOOP_SIZE, haystack.len());
+    let align = USIZE_BYTES - 1;
+    let start_ptr = haystack.as_ptr();
+    let end_ptr = haystack[haystack.len()..].as_ptr();
+    let mut ptr = end_ptr;
+
+    unsafe {
+        if haystack.len() < USIZE_BYTES {
+            return reverse_search(start_ptr, end_ptr, ptr, confirm);
+        }
+
+        let chunk = read_unaligned_usize(ptr_sub(ptr, USIZE_BYTES));
+        if contains_zero_byte(chunk ^ vn1) {
+            return reverse_search(start_ptr, end_ptr, ptr, confirm);
+        }
+
+        ptr = (end_ptr as usize & !align) as *const u8;
+        debug_assert!(start_ptr <= ptr && ptr <= end_ptr);
+        while loop_size == LOOP_SIZE && ptr >= ptr_add(start_ptr, loop_size) {
+            debug_assert_eq!(0, (ptr as usize) % USIZE_BYTES);
+
+            let a = *(ptr_sub(ptr, 2 * USIZE_BYTES) as *const usize);
+            let b = *(ptr_sub(ptr, 1 * USIZE_BYTES) as *const usize);
+            let eqa = contains_zero_byte(a ^ vn1);
+            let eqb = contains_zero_byte(b ^ vn1);
+            if eqa || eqb {
+                break;
+            }
+            ptr = ptr_sub(ptr, loop_size);
+        }
+        reverse_search(start_ptr, end_ptr, ptr, confirm)
+    }
+}
+
+/// Like `memrchr`, but searches for two bytes instead of one.
+pub fn memrchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
+    let vn1 = repeat_byte(n1);
+    let vn2 = repeat_byte(n2);
+    let confirm = |byte| byte == n1 || byte == n2;
+    let align = USIZE_BYTES - 1;
+    let start_ptr = haystack.as_ptr();
+    let end_ptr = haystack[haystack.len()..].as_ptr();
+    let mut ptr = end_ptr;
+
+    unsafe {
+        if haystack.len() < USIZE_BYTES {
+            return reverse_search(start_ptr, end_ptr, ptr, confirm);
+        }
+
+        let chunk = read_unaligned_usize(ptr_sub(ptr, USIZE_BYTES));
+        let eq1 = contains_zero_byte(chunk ^ vn1);
+        let eq2 = contains_zero_byte(chunk ^ vn2);
+        if eq1 || eq2 {
+            return reverse_search(start_ptr, end_ptr, ptr, confirm);
+        }
+
+        ptr = (end_ptr as usize & !align) as *const u8;
+        debug_assert!(start_ptr <= ptr && ptr <= end_ptr);
+        while ptr >= ptr_add(start_ptr, USIZE_BYTES) {
+            debug_assert_eq!(0, (ptr as usize) % USIZE_BYTES);
+
+            let chunk = *(ptr_sub(ptr, USIZE_BYTES) as *const usize);
+            let eq1 = contains_zero_byte(chunk ^ vn1);
+            let eq2 = contains_zero_byte(chunk ^ vn2);
+            if eq1 || eq2 {
+                break;
+            }
+            ptr = ptr_sub(ptr, USIZE_BYTES);
+        }
+        reverse_search(start_ptr, end_ptr, ptr, confirm)
+    }
+}
+
+/// Like `memrchr`, but searches for three bytes instead of one.
+pub fn memrchr3(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option<usize> {
+    let vn1 = repeat_byte(n1);
+    let vn2 = repeat_byte(n2);
+    let vn3 = repeat_byte(n3);
+    let confirm = |byte| byte == n1 || byte == n2 || byte == n3;
+    let align = USIZE_BYTES - 1;
+    let start_ptr = haystack.as_ptr();
+    let end_ptr = haystack[haystack.len()..].as_ptr();
+    let mut ptr = end_ptr;
+
+    unsafe {
+        if haystack.len() < USIZE_BYTES {
+            return reverse_search(start_ptr, end_ptr, ptr, confirm);
+        }
+
+        let chunk = read_unaligned_usize(ptr_sub(ptr, USIZE_BYTES));
+        let eq1 = contains_zero_byte(chunk ^ vn1);
+        let eq2 = contains_zero_byte(chunk ^ vn2);
+        let eq3 = contains_zero_byte(chunk ^ vn3);
+        if eq1 || eq2 || eq3 {
+            return reverse_search(start_ptr, end_ptr, ptr, confirm);
+        }
+
+        ptr = (end_ptr as usize & !align) as *const u8;
+        debug_assert!(start_ptr <= ptr && ptr <= end_ptr);
+        while ptr >= ptr_add(start_ptr, USIZE_BYTES) {
+            debug_assert_eq!(0, (ptr as usize) % USIZE_BYTES);
+
+            let chunk = *(ptr_sub(ptr, USIZE_BYTES) as *const usize);
+            let eq1 = contains_zero_byte(chunk ^ vn1);
+            let eq2 = contains_zero_byte(chunk ^ vn2);
+            let eq3 = contains_zero_byte(chunk ^ vn3);
+            if eq1 || eq2 || eq3 {
+                break;
+            }
+            ptr = ptr_sub(ptr, USIZE_BYTES);
+        }
+        reverse_search(start_ptr, end_ptr, ptr, confirm)
+    }
+}
+
+#[inline(always)]
+unsafe fn forward_search<F: Fn(u8) -> bool>(
+    start_ptr: *const u8,
+    end_ptr: *const u8,
+    mut ptr: *const u8,
+    confirm: F,
+) -> Option<usize> {
+    debug_assert!(start_ptr <= ptr);
+    debug_assert!(ptr <= end_ptr);
+
+    while ptr < end_ptr {
+        if confirm(*ptr) {
+            return Some(sub(ptr, start_ptr));
+        }
+        ptr = ptr.offset(1);
+    }
+    None
+}
+
+#[inline(always)]
+unsafe fn reverse_search<F: Fn(u8) -> bool>(
+    start_ptr: *const u8,
+    end_ptr: *const u8,
+    mut ptr: *const u8,
+    confirm: F,
+) -> Option<usize> {
+    debug_assert!(start_ptr <= ptr);
+    debug_assert!(ptr <= end_ptr);
+
+    while ptr > start_ptr {
+        ptr = ptr.offset(-1);
+        if confirm(*ptr) {
+            return Some(sub(ptr, start_ptr));
+        }
+    }
+    None
+}
+
+/// Increment the given pointer by the given amount.
+unsafe fn ptr_add(ptr: *const u8, amt: usize) -> *const u8 {
+    debug_assert!(amt < ::core::isize::MAX as usize);
+    ptr.offset(amt as isize)
+}
+
+/// Decrement the given pointer by the given amount.
+unsafe fn ptr_sub(ptr: *const u8, amt: usize) -> *const u8 {
+    debug_assert!(amt < ::core::isize::MAX as usize);
+    ptr.offset((amt as isize).wrapping_neg())
+}
+
+unsafe fn read_unaligned_usize(ptr: *const u8) -> usize {
+    let mut n: usize = 0;
+    ptr::copy_nonoverlapping(ptr, &mut n as *mut _ as *mut u8, USIZE_BYTES);
+    n
+}
+
+/// Subtract `b` from `a` and return the difference. `a` should be greater than
+/// or equal to `b`.
+fn sub(a: *const u8, b: *const u8) -> usize {
+    debug_assert!(a >= b);
+    (a as usize) - (b as usize)
+}
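
The fallback rests on the word-at-a-time trick cited above from *Matters Computational*: XOR a word with the needle repeated into every byte, then detect a zero byte via the borrow-propagation test. A small self-contained demonstration on a 64-bit word (constants copied from the code above; the sample word is arbitrary):

```rust
const LO: u64 = 0x0101010101010101;
const HI: u64 = 0x8080808080808080;

/// True if any byte of `x` is zero (borrow-propagation test).
fn contains_zero_byte(x: u64) -> bool {
    x.wrapping_sub(LO) & !x & HI != 0
}

/// Repeat `b` into every byte of a u64, e.g. 0x4E -> 0x4E4E4E4E4E4E4E4E.
fn repeat_byte(b: u8) -> u64 {
    (b as u64) * (u64::MAX / 255)
}

fn main() {
    let word = u64::from_ne_bytes(*b"the quic");
    // XORing with the repeated needle zeroes exactly the matching bytes,
    // so the zero-byte test doubles as a "word contains needle" test.
    assert!(contains_zero_byte(word ^ repeat_byte(b'q')));
    assert!(!contains_zero_byte(word ^ repeat_byte(b'z')));
}
```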
new file mode 100644
--- /dev/null
+++ b/third_party/rust/memchr/src/iter.rs
@@ -0,0 +1,177 @@
+use {memchr, memchr2, memchr3, memrchr, memrchr2, memrchr3};
+
+macro_rules! iter_next {
+    // Common code for the memchr iterators:
+    // update haystack and position and produce the index
+    //
+    // self: &mut Self where Self is the iterator
+    // search_result: Option<usize> which is the result of the corresponding
+    // memchr function.
+    //
+    // Returns Option<usize> (the next iterator element)
+    ($self_:expr, $search_result:expr) => {
+        $search_result.map(move |index| {
+            // split and take the remaining back half
+            $self_.haystack = $self_.haystack.split_at(index + 1).1;
+            let found_position = $self_.position + index;
+            $self_.position = found_position + 1;
+            found_position
+        })
+    }
+}
+
+macro_rules! iter_next_back {
+    ($self_:expr, $search_result:expr) => {
+        $search_result.map(move |index| {
+            // split and take the remaining front half
+            $self_.haystack = $self_.haystack.split_at(index).0;
+            $self_.position + index
+        })
+    }
+}
+
+/// An iterator for `memchr`.
+pub struct Memchr<'a> {
+    needle: u8,
+    // The haystack to iterate over
+    haystack: &'a [u8],
+    // The index
+    position: usize,
+}
+
+impl<'a> Memchr<'a> {
+    /// Creates a new iterator that yields all positions of needle in haystack.
+    #[inline]
+    pub fn new(needle: u8, haystack: &[u8]) -> Memchr {
+        Memchr {
+            needle: needle,
+            haystack: haystack,
+            position: 0,
+        }
+    }
+}
+
+impl<'a> Iterator for Memchr<'a> {
+    type Item = usize;
+
+    #[inline]
+    fn next(&mut self) -> Option<usize> {
+        iter_next!(self, memchr(self.needle, self.haystack))
+    }
+
+    #[inline]
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        (0, Some(self.haystack.len()))
+    }
+}
+
+impl<'a> DoubleEndedIterator for Memchr<'a> {
+    #[inline]
+    fn next_back(&mut self) -> Option<Self::Item> {
+        iter_next_back!(self, memrchr(self.needle, self.haystack))
+    }
+}
+
+/// An iterator for `memchr2`.
+pub struct Memchr2<'a> {
+    needle1: u8,
+    needle2: u8,
+    // The haystack to iterate over
+    haystack: &'a [u8],
+    // The index
+    position: usize,
+}
+
+impl<'a> Memchr2<'a> {
+    /// Creates a new iterator that yields all positions of needle in haystack.
+    #[inline]
+    pub fn new(needle1: u8, needle2: u8, haystack: &[u8]) -> Memchr2 {
+        Memchr2 {
+            needle1: needle1,
+            needle2: needle2,
+            haystack: haystack,
+            position: 0,
+        }
+    }
+}
+
+impl<'a> Iterator for Memchr2<'a> {
+    type Item = usize;
+
+    #[inline]
+    fn next(&mut self) -> Option<usize> {
+        iter_next!(self, memchr2(self.needle1, self.needle2, self.haystack))
+    }
+
+    #[inline]
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        (0, Some(self.haystack.len()))
+    }
+}
+
+impl<'a> DoubleEndedIterator for Memchr2<'a> {
+    #[inline]
+    fn next_back(&mut self) -> Option<Self::Item> {
+        iter_next_back!(
+            self,
+            memrchr2(self.needle1, self.needle2, self.haystack)
+        )
+    }
+}
+
+/// An iterator for `memchr3`.
+pub struct Memchr3<'a> {
+    needle1: u8,
+    needle2: u8,
+    needle3: u8,
+    // The haystack to iterate over
+    haystack: &'a [u8],
+    // The index
+    position: usize,
+}
+
+impl<'a> Memchr3<'a> {
+    /// Create a new `Memchr3` that's initialized to zero with a haystack
+    #[inline]
+    pub fn new(
+        needle1: u8,
+        needle2: u8,
+        needle3: u8,
+        haystack: &[u8],
+    ) -> Memchr3 {
+        Memchr3 {
+            needle1: needle1,
+            needle2: needle2,
+            needle3: needle3,
+            haystack: haystack,
+            position: 0,
+        }
+    }
+}
+
+impl<'a> Iterator for Memchr3<'a> {
+    type Item = usize;
+
+    #[inline]
+    fn next(&mut self) -> Option<usize> {
+        iter_next!(
+            self,
+            memchr3(self.needle1, self.needle2, self.needle3, self.haystack)
+        )
+    }
+
+    #[inline]
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        (0, Some(self.haystack.len()))
+    }
+}
+
+impl<'a> DoubleEndedIterator for Memchr3<'a> {
+    #[inline]
+    fn next_back(&mut self) -> Option<Self::Item> {
+        iter_next_back!(
+            self,
+            memrchr3(self.needle1, self.needle2, self.needle3, self.haystack)
+        )
+    }
+}
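
Each iterator advances by re-running the corresponding search on the shrinking haystack and offsetting by `position`. A brief usage sketch (editorial illustration; the `memchr_iter` convenience functions added in lib.rs below wrap exactly this):

```rust
use memchr::Memchr;

fn main() {
    let haystack = b"abcXdefXghiX";
    // Forward iteration yields every match position, in order.
    let fwd: Vec<usize> = Memchr::new(b'X', haystack).collect();
    assert_eq!(fwd, vec![3, 7, 11]);
    // DoubleEndedIterator support allows reverse traversal too.
    let rev: Vec<usize> = Memchr::new(b'X', haystack).rev().collect();
    assert_eq!(rev, vec![11, 7, 3]);
}
```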
--- a/third_party/rust/memchr/src/lib.rs
+++ b/third_party/rust/memchr/src/lib.rs
@@ -1,1062 +1,312 @@
 /*!
-This crate defines two functions, `memchr` and `memrchr`, which expose a safe
-interface to the corresponding functions in `libc`.
+The `memchr` crate provides heavily optimized routines for searching bytes.
+
+The `memchr` function is traditionally provided by libc, however, the
+performance of `memchr` can vary significantly depending on the specific
+implementation of libc that is used. They can range from manually tuned
+Assembly implementations (like that found in GNU's libc) all the way to
+non-vectorized C implementations (like that found in MUSL).
+
+To smooth out the differences between implementations of libc, at least
+on `x86_64` for Rust 1.27+, this crate provides its own implementation of
+`memchr` that should perform competitively with the one found in GNU's libc.
+The implementation is in pure Rust and has no dependency on a C compiler or an
+Assembler.
+
+Additionally, GNU libc also provides an extension, `memrchr`. This crate
+provides its own implementation of `memrchr` as well, on top of `memchr2`,
+`memchr3`, `memrchr2` and `memrchr3`. The difference between `memchr` and
+`memchr2` is that `memchr2` permits finding all occurrences of two bytes
+instead of one. Similarly for `memchr3`.
 */
 
-#![deny(missing_docs)]
-#![allow(unused_imports)]
-#![doc(html_root_url = "https://docs.rs/memchr/2.0.0")]
-
 #![cfg_attr(not(feature = "use_std"), no_std)]
 
-#[cfg(all(test, not(feature = "use_std")))]
-#[macro_use]
-extern crate std;
-
-#[cfg(all(feature = "libc", not(target_arch = "wasm32")))]
-extern crate libc;
+#![deny(missing_docs)]
+#![doc(html_root_url = "https://docs.rs/memchr/2.0.0")]
 
-#[macro_use]
-#[cfg(test)]
-extern crate quickcheck;
-
-#[cfg(all(feature = "libc", not(target_arch = "wasm32")))]
-use libc::c_void;
-#[cfg(all(feature = "libc", not(target_arch = "wasm32")))]
-use libc::{c_int, size_t};
+// Supporting 16-bit would be fine. If you need it, please submit a bug report
+// at https://github.com/BurntSushi/rust-memchr
+#[cfg(not(any(target_pointer_width = "32", target_pointer_width = "64")))]
+compile_error!("memchr currently not supported on non-32 or non-64 bit");
 
 #[cfg(feature = "use_std")]
-use std::cmp;
-#[cfg(not(feature = "use_std"))]
-use core::cmp;
+extern crate core;
 
-const LO_U64: u64 = 0x0101010101010101;
-const HI_U64: u64 = 0x8080808080808080;
+#[cfg(test)]
+#[macro_use]
+extern crate quickcheck;
 
-// use truncation
-const LO_USIZE: usize = LO_U64 as usize;
-const HI_USIZE: usize = HI_U64 as usize;
+use core::iter::Rev;
+
+pub use iter::{Memchr, Memchr2, Memchr3};
 
-#[cfg(target_pointer_width = "32")]
-const USIZE_BYTES: usize = 4;
-#[cfg(target_pointer_width = "64")]
-const USIZE_BYTES: usize = 8;
+// N.B. If you're looking for the cfg knobs for libc, see build.rs.
+#[cfg(memchr_libc)]
+mod c;
+#[allow(dead_code)]
+mod fallback;
+mod iter;
+mod naive;
+#[cfg(all(target_arch = "x86_64", memchr_runtime_simd))]
+mod x86;
+#[cfg(test)]
+mod tests;
 
-/// Return `true` if `x` contains any zero byte.
-///
-/// From *Matters Computational*, J. Arndt
-///
-/// "The idea is to subtract one from each of the bytes and then look for
-/// bytes where the borrow propagated all the way to the most significant
-/// bit."
+/// An iterator over all occurrences of the needle in a haystack.
 #[inline]
-fn contains_zero_byte(x: usize) -> bool {
-    x.wrapping_sub(LO_USIZE) & !x & HI_USIZE != 0
+pub fn memchr_iter(needle: u8, haystack: &[u8]) -> Memchr {
+    Memchr::new(needle, haystack)
 }
 
-#[cfg(target_pointer_width = "32")]
+/// An iterator over all occurrences of the needles in a haystack.
 #[inline]
-fn repeat_byte(b: u8) -> usize {
-    let mut rep = (b as usize) << 8 | b as usize;
-    rep = rep << 16 | rep;
-    rep
-}
-
-#[cfg(target_pointer_width = "64")]
-#[inline]
-fn repeat_byte(b: u8) -> usize {
-    let mut rep = (b as usize) << 8 | b as usize;
-    rep = rep << 16 | rep;
-    rep = rep << 32 | rep;
-    rep
+pub fn memchr2_iter(
+    needle1: u8,
+    needle2: u8,
+    haystack: &[u8],
+) -> Memchr2 {
+    Memchr2::new(needle1, needle2, haystack)
 }
 
-macro_rules! iter_next {
-    // Common code for the memchr iterators:
-    // update haystack and position and produce the index
-    //
-    // self: &mut Self where Self is the iterator
-    // search_result: Option<usize> which is the result of the corresponding
-    // memchr function.
-    //
-    // Returns Option<usize> (the next iterator element)
-    ($self_:expr, $search_result:expr) => {
-        $search_result.map(move |index| {
-            // split and take the remaining back half
-            $self_.haystack = $self_.haystack.split_at(index + 1).1;
-            let found_position = $self_.position + index;
-            $self_.position = found_position + 1;
-            found_position
-        })
-    }
+/// An iterator over all occurrences of the needles in a haystack.
+#[inline]
+pub fn memchr3_iter(
+    needle1: u8,
+    needle2: u8,
+    needle3: u8,
+    haystack: &[u8],
+) -> Memchr3 {
+    Memchr3::new(needle1, needle2, needle3, haystack)
 }
 
-macro_rules! iter_next_back {
-    ($self_:expr, $search_result:expr) => {
-        $search_result.map(move |index| {
-            // split and take the remaining front half
-            $self_.haystack = $self_.haystack.split_at(index).0;
-            $self_.position + index
-        })
-    }
+/// An iterator over all occurrences of the needle in a haystack, in reverse.
+#[inline]
+pub fn memrchr_iter(needle: u8, haystack: &[u8]) -> Rev<Memchr> {
+    Memchr::new(needle, haystack).rev()
 }
 
-/// An iterator for memchr
-pub struct Memchr<'a> {
-    needle: u8,
-    // The haystack to iterate over
-    haystack: &'a [u8],
-    // The index
-    position: usize,
-}
-
-impl<'a> Memchr<'a> {
-    /// Creates a new iterator that yields all positions of needle in haystack.
-    pub fn new(needle: u8, haystack: &[u8]) -> Memchr {
-        Memchr {
-            needle: needle,
-            haystack: haystack,
-            position: 0,
-        }
-    }
+/// An iterator over all occurrences of the needles in a haystack, in reverse.
+#[inline]
+pub fn memrchr2_iter(
+    needle1: u8,
+    needle2: u8,
+    haystack: &[u8],
+) -> Rev<Memchr2> {
+    Memchr2::new(needle1, needle2, haystack).rev()
 }
 
-impl<'a> Iterator for Memchr<'a> {
-    type Item = usize;
-
-    fn next(&mut self) -> Option<usize> {
-        iter_next!(self, memchr(self.needle, &self.haystack))
-    }
-
-    fn size_hint(&self) -> (usize, Option<usize>) {
-        (0, Some(self.haystack.len()))
-    }
+/// An iterator over all occurrences of the needles in a haystack, in reverse.
+#[inline]
+pub fn memrchr3_iter(
+    needle1: u8,
+    needle2: u8,
+    needle3: u8,
+    haystack: &[u8],
+) -> Rev<Memchr3> {
+    Memchr3::new(needle1, needle2, needle3, haystack).rev()
 }
 
-impl<'a> DoubleEndedIterator for Memchr<'a> {
-    fn next_back(&mut self) -> Option<Self::Item> {
-        iter_next_back!(self, memrchr(self.needle, &self.haystack))
-    }
-}
-
-/// A safe interface to `memchr`.
+/// Search for the first occurrence of a byte in a slice.
 ///
-/// Returns the index corresponding to the first occurrence of `needle` in
+/// This returns the index corresponding to the first occurrence of `needle` in
 /// `haystack`, or `None` if one is not found.
 ///
-/// memchr reduces to super-optimized machine code at around an order of
-/// magnitude faster than `haystack.iter().position(|&b| b == needle)`.
-/// (See benchmarks.)
+/// While this is operationally the same as something like
+/// `haystack.iter().position(|&b| b == needle)`, `memchr` will use a highly
+/// optimized routine that can be up to an order of magnitude faster in some
+/// cases.
 ///
 /// # Example
 ///
 /// This shows how to find the first position of a byte in a byte string.
 ///
-/// ```rust
+/// ```
 /// use memchr::memchr;
 ///
 /// let haystack = b"the quick brown fox";
 /// assert_eq!(memchr(b'k', haystack), Some(8));
 /// ```
-#[inline(always)] // reduces constant overhead
+#[inline]
 pub fn memchr(needle: u8, haystack: &[u8]) -> Option<usize> {
-    // libc memchr
-    #[cfg(all(feature = "libc",
-              not(target_arch = "wasm32"),
-              any(not(target_os = "windows"),
-                  not(any(target_pointer_width = "32",
-                          target_pointer_width = "64")))))]
-    #[inline(always)] // reduces constant overhead
-    fn memchr_specific(needle: u8, haystack: &[u8]) -> Option<usize> {
-        use libc::memchr as libc_memchr;
-
-        let p = unsafe {
-            libc_memchr(haystack.as_ptr() as *const c_void,
-                        needle as c_int,
-                        haystack.len() as size_t)
-        };
-        if p.is_null() {
-            None
-        } else {
-            Some(p as usize - (haystack.as_ptr() as usize))
-        }
-    }
-
-    // use fallback on windows, since it's faster
-    // use fallback on wasm32, since it doesn't have libc
-    #[cfg(all(any(not(feature = "libc"), target_os = "windows", target_arch = "wasm32"),
-              any(target_pointer_width = "32",
-                  target_pointer_width = "64")))]
-    fn memchr_specific(needle: u8, haystack: &[u8]) -> Option<usize> {
-        fallback::memchr(needle, haystack)
-    }
-
-    // For the rare case of neither 32 bit nor 64-bit platform.
-    #[cfg(all(any(not(feature = "libc"), target_os = "windows"),
-              not(target_pointer_width = "32"),
-              not(target_pointer_width = "64")))]
-    fn memchr_specific(needle: u8, haystack: &[u8]) -> Option<usize> {
-        haystack.iter().position(|&b| b == needle)
+    #[cfg(all(target_arch = "x86_64", memchr_runtime_simd))]
+    #[inline(always)]
+    fn imp(n1: u8, haystack: &[u8]) -> Option<usize> {
+        x86::memchr(n1, haystack)
     }
 
-    memchr_specific(needle, haystack)
-}
-
-/// A safe interface to `memrchr`.
-///
-/// Returns the index corresponding to the last occurrence of `needle` in
-/// `haystack`, or `None` if one is not found.
-///
-/// # Example
-///
-/// This shows how to find the last position of a byte in a byte string.
-///
-/// ```rust
-/// use memchr::memrchr;
-///
-/// let haystack = b"the quick brown fox";
-/// assert_eq!(memrchr(b'o', haystack), Some(17));
-/// ```
-#[inline(always)] // reduces constant overhead
-pub fn memrchr(needle: u8, haystack: &[u8]) -> Option<usize> {
-
-    #[cfg(all(feature = "libc", target_os = "linux"))]
-    #[inline(always)] // reduces constant overhead
-    fn memrchr_specific(needle: u8, haystack: &[u8]) -> Option<usize> {
-        // GNU's memrchr() will - unlike memchr() - error if haystack is empty.
-        if haystack.is_empty() {
-            return None;
-        }
-        let p = unsafe {
-            libc::memrchr(haystack.as_ptr() as *const c_void,
-                          needle as c_int,
-                          haystack.len() as size_t)
-        };
-        if p.is_null() {
-            None
-        } else {
-            Some(p as usize - (haystack.as_ptr() as usize))
-        }
+    #[cfg(all(
+        memchr_libc,
+        not(all(target_arch = "x86_64", memchr_runtime_simd))
+    ))]
+    #[inline(always)]
+    fn imp(n1: u8, haystack: &[u8]) -> Option<usize> {
+        c::memchr(n1, haystack)
     }
 
-    #[cfg(all(not(all(feature = "libc", target_os = "linux")),
-              any(target_pointer_width = "32", target_pointer_width = "64")))]
-    fn memrchr_specific(needle: u8, haystack: &[u8]) -> Option<usize> {
-        fallback::memrchr(needle, haystack)
-    }
-
-    // For the rare case of neither 32 bit nor 64-bit platform.
-    #[cfg(all(not(all(feature = "libc", target_os = "linux")),
-              not(target_pointer_width = "32"),
-              not(target_pointer_width = "64")))]
-    fn memrchr_specific(needle: u8, haystack: &[u8]) -> Option<usize> {
-        haystack.iter().rposition(|&b| b == needle)
+    #[cfg(all(
+        not(memchr_libc),
+        not(all(target_arch = "x86_64", memchr_runtime_simd))
+    ))]
+    #[inline(always)]
+    fn imp(n1: u8, haystack: &[u8]) -> Option<usize> {
+        fallback::memchr(n1, haystack)
     }
 
-    memrchr_specific(needle, haystack)
-}
-
-/// An iterator for Memchr2
-pub struct Memchr2<'a> {
-    needle1: u8,
-    needle2: u8,
-    // The haystack to iterate over
-    haystack: &'a [u8],
-    // The index
-    position: usize,
-}
-
-impl<'a> Memchr2<'a> {
-    /// Creates a new iterator that yields all positions of needle in haystack.
-    pub fn new(needle1: u8, needle2: u8, haystack: &[u8]) -> Memchr2 {
-        Memchr2 {
-            needle1: needle1,
-            needle2: needle2,
-            haystack: haystack,
-            position: 0,
-        }
+    if haystack.is_empty() {
+        None
+    } else {
+        imp(needle, haystack)
     }
 }
 
-impl<'a> Iterator for Memchr2<'a> {
-    type Item = usize;
-
-    fn next(&mut self) -> Option<usize> {
-        iter_next!(self, memchr2(self.needle1, self.needle2, &self.haystack))
-    }
-
-    fn size_hint(&self) -> (usize, Option<usize>) {
-        (0, Some(self.haystack.len()))
-    }
-}
-
-
 /// Like `memchr`, but searches for two bytes instead of one.
+#[inline]
 pub fn memchr2(needle1: u8, needle2: u8, haystack: &[u8]) -> Option<usize> {
-    fn slow(b1: u8, b2: u8, haystack: &[u8]) -> Option<usize> {
-        haystack.iter().position(|&b| b == b1 || b == b2)
+    #[cfg(all(target_arch = "x86_64", memchr_runtime_simd))]
+    #[inline(always)]
+    fn imp(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
+        x86::memchr2(n1, n2, haystack)
     }
 
-    let len = haystack.len();
-    let ptr = haystack.as_ptr();
-    let align = (ptr as usize) & (USIZE_BYTES - 1);
-    let mut i = 0;
-    if align > 0 {
-        i = cmp::min(USIZE_BYTES - align, len);
-        if let Some(found) = slow(needle1, needle2, &haystack[..i]) {
-            return Some(found);
-        }
-    }
-    let repeated_b1 = repeat_byte(needle1);
-    let repeated_b2 = repeat_byte(needle2);
-    if len >= USIZE_BYTES {
-        while i <= len - USIZE_BYTES {
-            unsafe {
-                let u = *(ptr.offset(i as isize) as *const usize);
-                let found_ub1 = contains_zero_byte(u ^ repeated_b1);
-                let found_ub2 = contains_zero_byte(u ^ repeated_b2);
-                if found_ub1 || found_ub2 {
-                    break;
-                }
-            }
-            i += USIZE_BYTES;
-        }
-    }
-    slow(needle1, needle2, &haystack[i..]).map(|pos| i + pos)
-}
-
-/// An iterator for Memchr3
-pub struct Memchr3<'a> {
-    needle1: u8,
-    needle2: u8,
-    needle3: u8,
-    // The haystack to iterate over
-    haystack: &'a [u8],
-    // The index
-    position: usize,
-}
-
-impl<'a> Memchr3<'a> {
-    /// Create a new Memchr2 that's initalized to zero with a haystack
-    pub fn new(
-        needle1: u8,
-        needle2: u8,
-        needle3: u8,
-        haystack: &[u8],
-    ) -> Memchr3 {
-        Memchr3 {
-            needle1: needle1,
-            needle2: needle2,
-            needle3: needle3,
-            haystack: haystack,
-            position: 0,
-        }
-    }
-}
-
-impl<'a> Iterator for Memchr3<'a> {
-    type Item = usize;
-
-    fn next(&mut self) -> Option<usize> {
-        iter_next!(
-            self,
-            memchr3(self.needle1, self.needle2, self.needle3, &self.haystack)
-        )
+    #[cfg(not(all(target_arch = "x86_64", memchr_runtime_simd)))]
+    #[inline(always)]
+    fn imp(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
+        fallback::memchr2(n1, n2, haystack)
     }
 
-    fn size_hint(&self) -> (usize, Option<usize>) {
-        (0, Some(self.haystack.len()))
+    if haystack.is_empty() {
+        None
+    } else {
+        imp(needle1, needle2, haystack)
     }
 }
 
 /// Like `memchr`, but searches for three bytes instead of one.
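+///
+/// # Example
+///
+/// This shows how to find the first position of one of three possible bytes
+/// in a byte string.
+///
+/// ```
+/// use memchr::memchr3;
+///
+/// let haystack = b"foo bar baz";
+/// // `b'r'` at index 6 is the leftmost occurrence of any of the needles.
+/// assert_eq!(memchr3(b'z', b'r', b'x', haystack), Some(6));
+/// ```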
+#[inline]
 pub fn memchr3(
     needle1: u8,
     needle2: u8,
     needle3: u8,
     haystack: &[u8],
 ) -> Option<usize> {
-    fn slow(b1: u8, b2: u8, b3: u8, haystack: &[u8]) -> Option<usize> {
-        haystack.iter().position(|&b| b == b1 || b == b2 || b == b3)
+    #[cfg(all(target_arch = "x86_64", memchr_runtime_simd))]
+    #[inline(always)]
+    fn imp(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option<usize> {
+        x86::memchr3(n1, n2, n3, haystack)
     }
 
-    let len = haystack.len();
-    let ptr = haystack.as_ptr();
-    let align = (ptr as usize) & (USIZE_BYTES - 1);
-    let mut i = 0;
-    if align > 0 {
-        i = cmp::min(USIZE_BYTES - align, len);
-        if let Some(found) = slow(needle1, needle2, needle3, &haystack[..i]) {
-            return Some(found);
-        }
-    }
-    let repeated_b1 = repeat_byte(needle1);
-    let repeated_b2 = repeat_byte(needle2);
-    let repeated_b3 = repeat_byte(needle3);
-    if len >= USIZE_BYTES {
-        while i <= len - USIZE_BYTES {
-            unsafe {
-                let u = *(ptr.offset(i as isize) as *const usize);
-                let found_ub1 = contains_zero_byte(u ^ repeated_b1);
-                let found_ub2 = contains_zero_byte(u ^ repeated_b2);
-                let found_ub3 = contains_zero_byte(u ^ repeated_b3);
-                if found_ub1 || found_ub2 || found_ub3 {
-                    break;
-                }
-            }
-            i += USIZE_BYTES;
-        }
-    }
-    slow(needle1, needle2, needle3, &haystack[i..]).map(|pos| i + pos)
-}
-
-#[allow(dead_code)]
-#[cfg(any(test, not(feature = "libc"), all(not(target_os = "linux"),
-          any(target_pointer_width = "32", target_pointer_width = "64"))))]
-mod fallback {
-    #[cfg(feature = "use_std")]
-    use std::cmp;
-    #[cfg(not(feature = "use_std"))]
-    use core::cmp;
-
-    use super::{
-        LO_U64, HI_U64, LO_USIZE, HI_USIZE, USIZE_BYTES,
-        contains_zero_byte, repeat_byte,
-    };
-
-    /// Return the first index matching the byte `a` in `text`.
-    pub fn memchr(x: u8, text: &[u8]) -> Option<usize> {
-        // Scan for a single byte value by reading two `usize` words at a time.
-        //
-        // Split `text` in three parts
-        // - unaligned inital part, before first word aligned address in text
-        // - body, scan by 2 words at a time
-        // - the last remaining part, < 2 word size
-        let len = text.len();
-        let ptr = text.as_ptr();
-
-        // search up to an aligned boundary
-        let align = (ptr as usize) & (USIZE_BYTES - 1);
-        let mut offset;
-        if align > 0 {
-            offset = cmp::min(USIZE_BYTES - align, len);
-            let pos = text[..offset].iter().position(|elt| *elt == x);
-            if let Some(index) = pos {
-                return Some(index);
-            }
-        } else {
-            offset = 0;
-        }
-
-        // search the body of the text
-        let repeated_x = repeat_byte(x);
-
-        if len >= 2 * USIZE_BYTES {
-            while offset <= len - 2 * USIZE_BYTES {
-                debug_assert_eq!((ptr as usize + offset) % USIZE_BYTES, 0);
-                unsafe {
-                    let u = *(ptr.offset(offset as isize) as *const usize);
-                    let v = *(ptr.offset((offset + USIZE_BYTES) as isize) as *const usize);
-
-                    // break if there is a matching byte
-                    let zu = contains_zero_byte(u ^ repeated_x);
-                    let zv = contains_zero_byte(v ^ repeated_x);
-                    if zu || zv {
-                        break;
-                    }
-                }
-                offset += USIZE_BYTES * 2;
-            }
-        }
-
-        // find the byte after the point the body loop stopped
-        text[offset..].iter().position(|elt| *elt == x).map(|i| offset + i)
+    #[cfg(not(all(target_arch = "x86_64", memchr_runtime_simd)))]
+    #[inline(always)]
+    fn imp(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option<usize> {
+        fallback::memchr3(n1, n2, n3, haystack)
     }
 
-    /// Return the last index matching the byte `a` in `text`.
-    pub fn memrchr(x: u8, text: &[u8]) -> Option<usize> {
-        // Scan for a single byte value by reading two `usize` words at a time.
-        //
-        // Split `text` in three parts
-        // - unaligned tail, after the last word aligned address in text
-        // - body, scan by 2 words at a time
-        // - the first remaining bytes, < 2 word size
-        let len = text.len();
-        let ptr = text.as_ptr();
-
-        // search to an aligned boundary
-        let end_align = (ptr as usize + len) & (USIZE_BYTES - 1);
-        let mut offset;
-        if end_align > 0 {
-            offset = if end_align >= len { 0 } else { len - end_align };
-            let pos = text[offset..].iter().rposition(|elt| *elt == x);
-            if let Some(index) = pos {
-                return Some(offset + index);
-            }
-        } else {
-            offset = len;
-        }
-
-        // search the body of the text
-        let repeated_x = repeat_byte(x);
-
-        while offset >= 2 * USIZE_BYTES {
-            debug_assert_eq!((ptr as usize + offset) % USIZE_BYTES, 0);
-            unsafe {
-                let u = *(ptr.offset(offset as isize - 2 * USIZE_BYTES as isize) as *const usize);
-                let v = *(ptr.offset(offset as isize - USIZE_BYTES as isize) as *const usize);
-
-                // break if there is a matching byte
-                let zu = contains_zero_byte(u ^ repeated_x);
-                let zv = contains_zero_byte(v ^ repeated_x);
-                if zu || zv {
-                    break;
-                }
-            }
-            offset -= 2 * USIZE_BYTES;
-        }
-
-        // find the byte before the point the body loop stopped
-        text[..offset].iter().rposition(|elt| *elt == x)
+    if haystack.is_empty() {
+        None
+    } else {
+        imp(needle1, needle2, needle3, haystack)
     }
 }
 
-#[cfg(test)]
-mod tests {
-    use std::prelude::v1::*;
-    use quickcheck;
-
-    use super::{memchr, memrchr, memchr2, memchr3, Memchr, Memchr2, Memchr3};
-    // Use a macro to test both native and fallback impls on all configurations
-    macro_rules! memchr_tests {
-        ($mod_name:ident, $memchr:path, $memrchr:path) => {
-            mod $mod_name {
-            use std::prelude::v1::*;
-            use quickcheck;
-            #[test]
-            fn matches_one() {
-                assert_eq!(Some(0), $memchr(b'a', b"a"));
-            }
-
-            #[test]
-            fn matches_begin() {
-                assert_eq!(Some(0), $memchr(b'a', b"aaaa"));
-            }
-
-            #[test]
-            fn matches_end() {
-                assert_eq!(Some(4), $memchr(b'z', b"aaaaz"));
-            }
-
-            #[test]
-            fn matches_nul() {
-                assert_eq!(Some(4), $memchr(b'\x00', b"aaaa\x00"));
-            }
-
-            #[test]
-            fn matches_past_nul() {
-                assert_eq!(Some(5), $memchr(b'z', b"aaaa\x00z"));
-            }
-
-            #[test]
-            fn no_match_empty() {
-                assert_eq!(None, $memchr(b'a', b""));
-            }
-
-            #[test]
-            fn no_match() {
-                assert_eq!(None, $memchr(b'a', b"xyz"));
-            }
-
-            #[test]
-            fn qc_never_fail() {
-                fn prop(needle: u8, haystack: Vec<u8>) -> bool {
-                    $memchr(needle, &haystack); true
-                }
-                quickcheck::quickcheck(prop as fn(u8, Vec<u8>) -> bool);
-            }
-
-            #[test]
-            fn matches_one_reversed() {
-                assert_eq!(Some(0), $memrchr(b'a', b"a"));
-            }
-
-            #[test]
-            fn matches_begin_reversed() {
-                assert_eq!(Some(3), $memrchr(b'a', b"aaaa"));
-            }
-
-            #[test]
-            fn matches_end_reversed() {
-                assert_eq!(Some(0), $memrchr(b'z', b"zaaaa"));
-            }
-
-            #[test]
-            fn matches_nul_reversed() {
-                assert_eq!(Some(4), $memrchr(b'\x00', b"aaaa\x00"));
-            }
-
-            #[test]
-            fn matches_past_nul_reversed() {
-                assert_eq!(Some(0), $memrchr(b'z', b"z\x00aaaa"));
-            }
-
-            #[test]
-            fn no_match_empty_reversed() {
-                assert_eq!(None, $memrchr(b'a', b""));
-            }
-
-            #[test]
-            fn no_match_reversed() {
-                assert_eq!(None, $memrchr(b'a', b"xyz"));
-            }
-
-            #[test]
-            fn qc_never_fail_reversed() {
-                fn prop(needle: u8, haystack: Vec<u8>) -> bool {
-                    $memrchr(needle, &haystack); true
-                }
-                quickcheck::quickcheck(prop as fn(u8, Vec<u8>) -> bool);
-            }
-
-            #[test]
-            fn qc_correct_memchr() {
-                fn prop(v: Vec<u8>, offset: u8) -> bool {
-                    // test all pointer alignments
-                    let uoffset = (offset & 0xF) as usize;
-                    let data = if uoffset <= v.len() {
-                        &v[uoffset..]
-                    } else {
-                        &v[..]
-                    };
-                    for byte in 0..256u32 {
-                        let byte = byte as u8;
-                        let pos = data.iter().position(|elt| *elt == byte);
-                        if $memchr(byte, &data) != pos {
-                            return false;
-                        }
-                    }
-                    true
-                }
-                quickcheck::quickcheck(prop as fn(Vec<u8>, u8) -> bool);
-            }
-
-            #[test]
-            fn qc_correct_memrchr() {
-                fn prop(v: Vec<u8>, offset: u8) -> bool {
-                    // test all pointer alignments
-                    let uoffset = (offset & 0xF) as usize;
-                    let data = if uoffset <= v.len() {
-                        &v[uoffset..]
-                    } else {
-                        &v[..]
-                    };
-                    for byte in 0..256u32 {
-                        let byte = byte as u8;
-                        let pos = data.iter().rposition(|elt| *elt == byte);
-                        if $memrchr(byte, &data) != pos {
-                            return false;
-                        }
-                    }
-                    true
-                }
-                quickcheck::quickcheck(prop as fn(Vec<u8>, u8) -> bool);
-            }
-            }
-        }
+/// Search for the last occurrence of a byte in a slice.
+///
+/// This returns the index corresponding to the last occurrence of `needle` in
+/// `haystack`, or `None` if one is not found.
+///
+/// While this is operationally the same as something like
+/// `haystack.iter().rposition(|&b| b == needle)`, `memrchr` will use a highly
+/// optimized routine that can be up to an order of magnitude faster in some
+/// cases.
+///
+/// # Example
+///
+/// This shows how to find the last position of a byte in a byte string.
+///
+/// ```
+/// use memchr::memrchr;
+///
+/// let haystack = b"the quick brown fox";
+/// assert_eq!(memrchr(b'o', haystack), Some(17));
+/// ```
+#[inline]
+pub fn memrchr(needle: u8, haystack: &[u8]) -> Option<usize> {
+    #[cfg(all(target_arch = "x86_64", memchr_runtime_simd))]
+    #[inline(always)]
+    fn imp(n1: u8, haystack: &[u8]) -> Option<usize> {
+        x86::memrchr(n1, haystack)
     }
 
-    memchr_tests! { native, ::memchr, ::memrchr }
-    memchr_tests! { fallback, ::fallback::memchr, ::fallback::memrchr }
-
-    #[test]
-    fn memchr2_matches_one() {
-        assert_eq!(Some(0), memchr2(b'a', b'b', b"a"));
-        assert_eq!(Some(0), memchr2(b'a', b'b', b"b"));
-        assert_eq!(Some(0), memchr2(b'b', b'a', b"a"));
-        assert_eq!(Some(0), memchr2(b'b', b'a', b"b"));
-    }
-
-    #[test]
-    fn memchr2_matches_begin() {
-        assert_eq!(Some(0), memchr2(b'a', b'b', b"aaaa"));
-        assert_eq!(Some(0), memchr2(b'a', b'b', b"bbbb"));
-    }
-
-    #[test]
-    fn memchr2_matches_end() {
-        assert_eq!(Some(4), memchr2(b'z', b'y', b"aaaaz"));
-        assert_eq!(Some(4), memchr2(b'z', b'y', b"aaaay"));
-    }
-
-    #[test]
-    fn memchr2_matches_nul() {
-        assert_eq!(Some(4), memchr2(b'\x00', b'z', b"aaaa\x00"));
-        assert_eq!(Some(4), memchr2(b'z', b'\x00', b"aaaa\x00"));
-    }
-
-    #[test]
-    fn memchr2_matches_past_nul() {
-        assert_eq!(Some(5), memchr2(b'z', b'y', b"aaaa\x00z"));
-        assert_eq!(Some(5), memchr2(b'y', b'z', b"aaaa\x00z"));
-    }
-
-    #[test]
-    fn memchr2_no_match_empty() {
-        assert_eq!(None, memchr2(b'a', b'b', b""));
-        assert_eq!(None, memchr2(b'b', b'a', b""));
-    }
-
-    #[test]
-    fn memchr2_no_match() {
-        assert_eq!(None, memchr2(b'a', b'b', b"xyz"));
-    }
-
-    #[test]
-    fn qc_never_fail_memchr2() {
-        fn prop(needle1: u8, needle2: u8, haystack: Vec<u8>) -> bool {
-            memchr2(needle1, needle2, &haystack);
-            true
-        }
-        quickcheck::quickcheck(prop as fn(u8, u8, Vec<u8>) -> bool);
+    #[cfg(all(
+        all(memchr_libc, target_os = "linux"),
+        not(all(target_arch = "x86_64", memchr_runtime_simd))
+    ))]
+    #[inline(always)]
+    fn imp(n1: u8, haystack: &[u8]) -> Option<usize> {
+        c::memrchr(n1, haystack)
     }
 
-    #[test]
-    fn memchr3_matches_one() {
-        assert_eq!(Some(0), memchr3(b'a', b'b', b'c', b"a"));
-        assert_eq!(Some(0), memchr3(b'a', b'b', b'c', b"b"));
-        assert_eq!(Some(0), memchr3(b'a', b'b', b'c', b"c"));
-    }
-
-    #[test]
-    fn memchr3_matches_begin() {
-        assert_eq!(Some(0), memchr3(b'a', b'b', b'c', b"aaaa"));
-        assert_eq!(Some(0), memchr3(b'a', b'b', b'c', b"bbbb"));
-        assert_eq!(Some(0), memchr3(b'a', b'b', b'c', b"cccc"));
-    }
-
-    #[test]
-    fn memchr3_matches_end() {
-        assert_eq!(Some(4), memchr3(b'z', b'y', b'x', b"aaaaz"));
-        assert_eq!(Some(4), memchr3(b'z', b'y', b'x', b"aaaay"));
-        assert_eq!(Some(4), memchr3(b'z', b'y', b'x', b"aaaax"));
-    }
-
-    #[test]
-    fn memchr3_matches_nul() {
-        assert_eq!(Some(4), memchr3(b'\x00', b'z', b'y', b"aaaa\x00"));
-        assert_eq!(Some(4), memchr3(b'z', b'\x00', b'y', b"aaaa\x00"));
-        assert_eq!(Some(4), memchr3(b'z', b'y', b'\x00', b"aaaa\x00"));
-    }
-
-    #[test]
-    fn memchr3_matches_past_nul() {
-        assert_eq!(Some(5), memchr3(b'z', b'y', b'x', b"aaaa\x00z"));
-        assert_eq!(Some(5), memchr3(b'y', b'z', b'x', b"aaaa\x00z"));
-        assert_eq!(Some(5), memchr3(b'y', b'x', b'z', b"aaaa\x00z"));
-    }
-
-    #[test]
-    fn memchr3_no_match_empty() {
-        assert_eq!(None, memchr3(b'a', b'b', b'c', b""));
-        assert_eq!(None, memchr3(b'b', b'a', b'c', b""));
-        assert_eq!(None, memchr3(b'c', b'b', b'a', b""));
-    }
-
-    #[test]
-    fn memchr3_no_match() {
-        assert_eq!(None, memchr3(b'a', b'b', b'c', b"xyz"));
-    }
-
-    // return an iterator of the 0-based indices of haystack that match the
-    // needle
-    fn positions1<'a>(needle: u8, haystack: &'a [u8])
-        -> Box<DoubleEndedIterator<Item=usize> + 'a>
-    {
-        Box::new(haystack.iter()
-                         .enumerate()
-                         .filter(move |&(_, &elt)| elt == needle)
-                         .map(|t| t.0))
-    }
-
-    fn positions2<'a>(needle1: u8, needle2: u8, haystack: &'a [u8])
-        -> Box<DoubleEndedIterator<Item=usize> + 'a>
-    {
-        Box::new(haystack
-            .iter()
-            .enumerate()
-            .filter(move |&(_, &elt)| elt == needle1 || elt == needle2)
-            .map(|t| t.0))
+    #[cfg(all(
+        not(all(memchr_libc, target_os = "linux")),
+        not(all(target_arch = "x86_64", memchr_runtime_simd))
+    ))]
+    #[inline(always)]
+    fn imp(n1: u8, haystack: &[u8]) -> Option<usize> {
+        fallback::memrchr(n1, haystack)
     }
 
-    fn positions3<'a>(
-        needle1: u8,
-        needle2: u8,
-        needle3: u8,
-        haystack: &'a [u8],
-    ) -> Box<DoubleEndedIterator<Item=usize> + 'a> {
-        Box::new(haystack
-            .iter()
-            .enumerate()
-            .filter(move |&(_, &elt)| {
-                elt == needle1 || elt == needle2 || elt == needle3
-            })
-            .map(|t| t.0))
+    if haystack.is_empty() {
+        None
+    } else {
+        imp(needle, haystack)
     }
-
-    #[test]
-    fn memchr_iter() {
-        let haystack = b"aaaabaaaab";
-        let mut memchr_iter = Memchr::new(b'b', haystack);
-        let first = memchr_iter.next();
-        let second = memchr_iter.next();
-        let third = memchr_iter.next();
-
-        let mut answer_iter = positions1(b'b', haystack);
-        assert_eq!(answer_iter.next(), first);
-        assert_eq!(answer_iter.next(), second);
-        assert_eq!(answer_iter.next(), third);
-    }
+}
 
-    #[test]
-    fn memchr2_iter() {
-        let haystack = b"axxb";
-        let mut memchr_iter = Memchr2::new(b'a', b'b', haystack);
-        let first = memchr_iter.next();
-        let second = memchr_iter.next();
-        let third = memchr_iter.next();
-
-        let mut answer_iter = positions2(b'a', b'b', haystack);
-        assert_eq!(answer_iter.next(), first);
-        assert_eq!(answer_iter.next(), second);
-        assert_eq!(answer_iter.next(), third);
-    }
-
-    #[test]
-    fn memchr3_iter() {
-        let haystack = b"axxbc";
-        let mut memchr_iter = Memchr3::new(b'a', b'b', b'c', haystack);
-        let first = memchr_iter.next();
-        let second = memchr_iter.next();
-        let third = memchr_iter.next();
-        let fourth = memchr_iter.next();
-
-        let mut answer_iter = positions3(b'a', b'b', b'c', haystack);
-        assert_eq!(answer_iter.next(), first);
-        assert_eq!(answer_iter.next(), second);
-        assert_eq!(answer_iter.next(), third);
-        assert_eq!(answer_iter.next(), fourth);
+/// Like `memrchr`, but searches for two bytes instead of one.
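+///
+/// # Example
+///
+/// This shows how to find the last position of one of two possible bytes in
+/// a byte string.
+///
+/// ```
+/// use memchr::memrchr2;
+///
+/// let haystack = b"foo bar baz";
+/// // The rightmost match wins: `b'a'` at index 9 follows `b'r'` at index 6.
+/// assert_eq!(memrchr2(b'a', b'r', haystack), Some(9));
+/// ```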
+#[inline]
+pub fn memrchr2(needle1: u8, needle2: u8, haystack: &[u8]) -> Option<usize> {
+    #[cfg(all(target_arch = "x86_64", memchr_runtime_simd))]
+    #[inline(always)]
+    fn imp(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
+        x86::memrchr2(n1, n2, haystack)
     }
 
-    #[test]
-    fn memchr_reverse_iter() {
-        let haystack = b"aaaabaaaabaaaab";
-        let mut memchr_iter = Memchr::new(b'b', haystack);
-        let first = memchr_iter.next();
-        let second = memchr_iter.next_back();
-        let third = memchr_iter.next();
-        let fourth = memchr_iter.next_back();
-
-        let mut answer_iter = positions1(b'b', haystack);
-        assert_eq!(answer_iter.next(), first);
-        assert_eq!(answer_iter.next_back(), second);
-        assert_eq!(answer_iter.next(), third);
-        assert_eq!(answer_iter.next_back(), fourth);
-    }
-
-    #[test]
-    fn memrchr_iter(){
-        let haystack = b"aaaabaaaabaaaab";
-        let mut memchr_iter = Memchr::new(b'b', haystack);
-        let first = memchr_iter.next_back();
-        let second = memchr_iter.next_back();
-        let third = memchr_iter.next_back();
-        let fourth = memchr_iter.next_back();
-
-        let mut answer_iter = positions1(b'b', haystack);
-        assert_eq!(answer_iter.next_back(), first);
-        assert_eq!(answer_iter.next_back(), second);
-        assert_eq!(answer_iter.next_back(), third);
-        assert_eq!(answer_iter.next_back(), fourth);
-
-    }
-
-    #[test]
-    fn qc_never_fail_memchr3() {
-        fn prop(
-            needle1: u8,
-            needle2: u8,
-            needle3: u8,
-            haystack: Vec<u8>,
-        ) -> bool {
-            memchr3(needle1, needle2, needle3, &haystack);
-            true
-        }
-        quickcheck::quickcheck(prop as fn(u8, u8, u8, Vec<u8>) -> bool);
-    }
-
-    #[test]
-    fn qc_correct_memchr() {
-        fn prop(v: Vec<u8>, offset: u8) -> bool {
-            // test all pointer alignments
-            let uoffset = (offset & 0xF) as usize;
-            let data = if uoffset <= v.len() {
-                &v[uoffset..]
-            } else {
-                &v[..]
-            };
-            for byte in 0..256u32 {
-                let byte = byte as u8;
-                let pos = data.iter().position(|elt| *elt == byte);
-                if memchr(byte, &data) != pos {
-                    return false;
-                }
-            }
-            true
-        }
-        quickcheck::quickcheck(prop as fn(Vec<u8>, u8) -> bool);
+    #[cfg(not(all(target_arch = "x86_64", memchr_runtime_simd)))]
+    #[inline(always)]
+    fn imp(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
+        fallback::memrchr2(n1, n2, haystack)
     }
 
-    #[test]
-    fn qc_correct_memrchr() {
-        fn prop(v: Vec<u8>, offset: u8) -> bool {
-            // test all pointer alignments
-            let uoffset = (offset & 0xF) as usize;
-            let data = if uoffset <= v.len() {
-                &v[uoffset..]
-            } else {
-                &v[..]
-            };
-            for byte in 0..256u32 {
-                let byte = byte as u8;
-                let pos = data.iter().rposition(|elt| *elt == byte);
-                if memrchr(byte, &data) != pos {
-                    return false;
-                }
-            }
-            true
-        }
-        quickcheck::quickcheck(prop as fn(Vec<u8>, u8) -> bool);
+    if haystack.is_empty() {
+        None
+    } else {
+        imp(needle1, needle2, haystack)
     }
+}
 
-    #[test]
-    fn qc_correct_memchr2() {
-        fn prop(v: Vec<u8>, offset: u8) -> bool {
-            // test all pointer alignments
-            let uoffset = (offset & 0xF) as usize;
-            let data = if uoffset <= v.len() {
-                &v[uoffset..]
-            } else {
-                &v[..]
-            };
-            for b1 in 0..256u32 {
-                for b2 in 0..256u32 {
-                    let (b1, b2) = (b1 as u8, b2 as u8);
-                    let expected = data
-                        .iter()
-                        .position(|&b| b == b1 || b == b2);
-                    let got = memchr2(b1, b2, &data);
-                    if expected != got {
-                        return false;
-                    }
-                }
-            }
-            true
-        }
-        quickcheck::quickcheck(prop as fn(Vec<u8>, u8) -> bool);
+/// Like `memrchr`, but searches for three bytes instead of one.
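+///
+/// # Example
+///
+/// This shows how to find the last position of one of three possible bytes
+/// in a byte string.
+///
+/// ```
+/// use memchr::memrchr3;
+///
+/// let haystack = b"foo bar baz";
+/// // `b'z'` at index 10 is the rightmost occurrence of any of the needles.
+/// assert_eq!(memrchr3(b'f', b'o', b'z', haystack), Some(10));
+/// ```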
+#[inline]
+pub fn memrchr3(
+    needle1: u8,
+    needle2: u8,
+    needle3: u8,
+    haystack: &[u8],
+) -> Option<usize> {
+    #[cfg(all(target_arch = "x86_64", memchr_runtime_simd))]
+    #[inline(always)]
+    fn imp(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option<usize> {
+        x86::memrchr3(n1, n2, n3, haystack)
     }
 
-    // take items from a DEI, taking front for each true and back for each
-    // false. Return a vector with the concatenation of the fronts and the
-    // reverse of the backs.
-    fn double_ended_take<I, J>(mut iter: I, take_side: J) -> Vec<I::Item>
-        where I: DoubleEndedIterator,
-              J: Iterator<Item=bool>,
-    {
-        let mut found_front = Vec::new();
-        let mut found_back = Vec::new();
-
-        for take_front in take_side {
-            if take_front {
-                if let Some(pos) = iter.next() {
-                    found_front.push(pos);
-                } else {
-                    break;
-                }
-            } else {
-                if let Some(pos) = iter.next_back() {
-                    found_back.push(pos);
-                } else {
-                    break;
-                }
-            };
-        }
-
-        let mut all_found = found_front;
-        all_found.extend(found_back.into_iter().rev());
-        all_found
+    #[cfg(not(all(target_arch = "x86_64", memchr_runtime_simd)))]
+    #[inline(always)]
+    fn imp(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option<usize> {
+        fallback::memrchr3(n1, n2, n3, haystack)
     }
 
-
-    quickcheck! {
-        fn qc_memchr_double_ended_iter(needle: u8, data: Vec<u8>,
-                                       take_side: Vec<bool>) -> bool
-        {
-            // make nonempty
-            let mut take_side = take_side;
-            if take_side.is_empty() { take_side.push(true) };
-
-            let iter = Memchr::new(needle, &data);
-            let all_found = double_ended_take(
-                iter, take_side.iter().cycle().cloned());
-
-            all_found.iter().cloned().eq(positions1(needle, &data))
-        }
-
-        fn qc_memchr1_iter(data: Vec<u8>) -> bool {
-            let needle = 0;
-            let answer = positions1(needle, &data);
-            answer.eq(Memchr::new(needle, &data))
-        }
-
-        fn qc_memchr1_rev_iter(data: Vec<u8>) -> bool {
-            let needle = 0;
-            let answer = positions1(needle, &data);
-            answer.rev().eq(Memchr::new(needle, &data).rev())
-        }
-
-        fn qc_memchr2_iter(data: Vec<u8>) -> bool {
-            let needle1 = 0;
-            let needle2 = 1;
-            let answer = positions2(needle1, needle2, &data);
-            answer.eq(Memchr2::new(needle1, needle2, &data))
-        }
-
-        fn qc_memchr3_iter(data: Vec<u8>) -> bool {
-            let needle1 = 0;
-            let needle2 = 1;
-            let needle3 = 2;
-            let answer = positions3(needle1, needle2, needle3, &data);
-            answer.eq(Memchr3::new(needle1, needle2, needle3, &data))
-        }
-
-        fn qc_memchr1_iter_size_hint(data: Vec<u8>) -> bool {
-            // test that the size hint is within reasonable bounds
-            let needle = 0;
-            let mut iter = Memchr::new(needle, &data);
-            let mut real_count = data
-                .iter()
-                .filter(|&&elt| elt == needle)
-                .count();
-
-            while let Some(index) = iter.next() {
-                real_count -= 1;
-                let (lower, upper) = iter.size_hint();
-                assert!(lower <= real_count);
-                assert!(upper.unwrap() >= real_count);
-                assert!(upper.unwrap() <= data.len() - index);
-            }
-            true
-        }
+    if haystack.is_empty() {
+        None
+    } else {
+        imp(needle1, needle2, needle3, haystack)
     }
 }
new file mode 100644
--- /dev/null
+++ b/third_party/rust/memchr/src/naive.rs
@@ -0,0 +1,37 @@
+#![allow(dead_code)]
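+
+// Naive, obviously-correct definitions of each searcher in terms of iterator
+// adaptors. The quickcheck tests in src/tests/memchr.rs use these as an
+// oracle for the optimized implementations.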
+
+pub fn memchr(n1: u8, haystack: &[u8]) -> Option<usize> {
+    haystack
+        .iter()
+        .position(|&b| b == n1)
+}
+
+pub fn memchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
+    haystack
+        .iter()
+        .position(|&b| b == n1 || b == n2)
+}
+
+pub fn memchr3(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option<usize> {
+    haystack
+        .iter()
+        .position(|&b| b == n1 || b == n2 || b == n3)
+}
+
+pub fn memrchr(n1: u8, haystack: &[u8]) -> Option<usize> {
+    haystack
+        .iter()
+        .rposition(|&b| b == n1)
+}
+
+pub fn memrchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
+    haystack
+        .iter()
+        .rposition(|&b| b == n1 || b == n2)
+}
+
+pub fn memrchr3(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option<usize> {
+    haystack
+        .iter()
+        .rposition(|&b| b == n1 || b == n2 || b == n3)
+}
new file mode 100644
--- /dev/null
+++ b/third_party/rust/memchr/src/tests/iter.rs
@@ -0,0 +1,228 @@
+use tests::memchr_tests;
+use {Memchr, Memchr2, Memchr3};
+
+#[test]
+fn memchr1_iter() {
+    for test in memchr_tests() {
+        test.iter_one(false, Memchr::new);
+    }
+}
+
+#[test]
+fn memchr2_iter() {
+    for test in memchr_tests() {
+        test.iter_two(false, Memchr2::new);
+    }
+}
+
+#[test]
+fn memchr3_iter() {
+    for test in memchr_tests() {
+        test.iter_three(false, Memchr3::new);
+    }
+}
+
+#[test]
+fn memrchr1_iter() {
+    for test in memchr_tests() {
+        test.iter_one(true, |n1, corpus| Memchr::new(n1, corpus).rev());
+    }
+}
+
+#[test]
+fn memrchr2_iter() {
+    for test in memchr_tests() {
+        test.iter_two(true, |n1, n2, corpus| {
+            Memchr2::new(n1, n2, corpus).rev()
+        })
+    }
+}
+
+#[test]
+fn memrchr3_iter() {
+    for test in memchr_tests() {
+        test.iter_three(true, |n1, n2, n3, corpus| {
+            Memchr3::new(n1, n2, n3, corpus).rev()
+        })
+    }
+}
+
+quickcheck! {
+    fn qc_memchr_double_ended_iter(
+        needle: u8, data: Vec<u8>, take_side: Vec<bool>
+    ) -> bool {
+        // make nonempty
+        let mut take_side = take_side;
+        if take_side.is_empty() { take_side.push(true) };
+
+        let iter = Memchr::new(needle, &data);
+        let all_found = double_ended_take(
+            iter, take_side.iter().cycle().cloned());
+
+        all_found.iter().cloned().eq(positions1(needle, &data))
+    }
+
+    fn qc_memchr2_double_ended_iter(
+        needle1: u8, needle2: u8, data: Vec<u8>, take_side: Vec<bool>
+    ) -> bool {
+        // make nonempty
+        let mut take_side = take_side;
+        if take_side.is_empty() { take_side.push(true) };
+
+        let iter = Memchr2::new(needle1, needle2, &data);
+        let all_found = double_ended_take(
+            iter, take_side.iter().cycle().cloned());
+
+        all_found.iter().cloned().eq(positions2(needle1, needle2, &data))
+    }
+
+    fn qc_memchr3_double_ended_iter(
+        needle1: u8, needle2: u8, needle3: u8,
+        data: Vec<u8>, take_side: Vec<bool>
+    ) -> bool {
+        // make nonempty
+        let mut take_side = take_side;
+        if take_side.is_empty() { take_side.push(true) };
+
+        let iter = Memchr3::new(needle1, needle2, needle3, &data);
+        let all_found = double_ended_take(
+            iter, take_side.iter().cycle().cloned());
+
+        all_found
+            .iter()
+            .cloned()
+            .eq(positions3(needle1, needle2, needle3, &data))
+    }
+
+    fn qc_memchr1_iter(data: Vec<u8>) -> bool {
+        let needle = 0;
+        let answer = positions1(needle, &data);
+        answer.eq(Memchr::new(needle, &data))
+    }
+
+    fn qc_memchr1_rev_iter(data: Vec<u8>) -> bool {
+        let needle = 0;
+        let answer = positions1(needle, &data);
+        answer.rev().eq(Memchr::new(needle, &data).rev())
+    }
+
+    fn qc_memchr2_iter(data: Vec<u8>) -> bool {
+        let needle1 = 0;
+        let needle2 = 1;
+        let answer = positions2(needle1, needle2, &data);
+        answer.eq(Memchr2::new(needle1, needle2, &data))
+    }
+
+    fn qc_memchr2_rev_iter(data: Vec<u8>) -> bool {
+        let needle1 = 0;
+        let needle2 = 1;
+        let answer = positions2(needle1, needle2, &data);
+        answer.rev().eq(Memchr2::new(needle1, needle2, &data).rev())
+    }
+
+    fn qc_memchr3_iter(data: Vec<u8>) -> bool {
+        let needle1 = 0;
+        let needle2 = 1;
+        let needle3 = 2;
+        let answer = positions3(needle1, needle2, needle3, &data);
+        answer.eq(Memchr3::new(needle1, needle2, needle3, &data))
+    }
+
+    fn qc_memchr3_rev_iter(data: Vec<u8>) -> bool {
+        let needle1 = 0;
+        let needle2 = 1;
+        let needle3 = 2;
+        let answer = positions3(needle1, needle2, needle3, &data);
+        answer.rev().eq(Memchr3::new(needle1, needle2, needle3, &data).rev())
+    }
+
+    fn qc_memchr1_iter_size_hint(data: Vec<u8>) -> bool {
+        // test that the size hint is within reasonable bounds
+        let needle = 0;
+        let mut iter = Memchr::new(needle, &data);
+        let mut real_count = data
+            .iter()
+            .filter(|&&elt| elt == needle)
+            .count();
+
+        while let Some(index) = iter.next() {
+            real_count -= 1;
+            let (lower, upper) = iter.size_hint();
+            assert!(lower <= real_count);
+            assert!(upper.unwrap() >= real_count);
+            assert!(upper.unwrap() <= data.len() - index);
+        }
+        true
+    }
+}
+
+// take items from a DEI, taking front for each true and back for each false.
+// Return a vector with the concatenation of the fronts and the reverse of the
+// backs.
+fn double_ended_take<I, J>(mut iter: I, take_side: J) -> Vec<I::Item>
+    where I: DoubleEndedIterator,
+          J: Iterator<Item=bool>,
+{
+    let mut found_front = Vec::new();
+    let mut found_back = Vec::new();
+
+    for take_front in take_side {
+        if take_front {
+            if let Some(pos) = iter.next() {
+                found_front.push(pos);
+            } else {
+                break;
+            }
+        } else {
+            if let Some(pos) = iter.next_back() {
+                found_back.push(pos);
+            } else {
+                break;
+            }
+        };
+    }
+
+    let mut all_found = found_front;
+    all_found.extend(found_back.into_iter().rev());
+    all_found
+}
+
+// return an iterator of the 0-based indices of haystack that match the needle
+fn positions1<'a>(
+    n1: u8,
+    haystack: &'a [u8],
+) -> Box<DoubleEndedIterator<Item=usize> + 'a> {
+    let it = haystack
+        .iter()
+        .enumerate()
+        .filter(move |&(_, &b)| b == n1)
+        .map(|t| t.0);
+    Box::new(it)
+}
+
+fn positions2<'a>(
+    n1: u8,
+    n2: u8,
+    haystack: &'a [u8],
+) -> Box<DoubleEndedIterator<Item=usize> + 'a> {
+    let it = haystack
+        .iter()
+        .enumerate()
+        .filter(move |&(_, &b)| b == n1 || b == n2)
+        .map(|t| t.0);
+    Box::new(it)
+}
+
+fn positions3<'a>(
+    n1: u8,
+    n2: u8,
+    n3: u8,
+    haystack: &'a [u8],
+) -> Box<DoubleEndedIterator<Item=usize> + 'a> {
+    let it = haystack
+        .iter()
+        .enumerate()
+        .filter(move |&(_, &b)| b == n1 || b == n2 || b == n3)
+        .map(|t| t.0);
+    Box::new(it)
+}
new file mode 100644
--- /dev/null
+++ b/third_party/rust/memchr/src/tests/memchr.rs
@@ -0,0 +1,131 @@
+use fallback;
+use naive;
+use {memchr, memchr2, memchr3, memrchr, memrchr2, memrchr3};
+
+use tests::memchr_tests;
+
+#[test]
+fn memchr1_find() {
+    for test in memchr_tests() {
+        test.one(false, memchr);
+    }
+}
+
+#[test]
+fn memchr1_fallback_find() {
+    for test in memchr_tests() {
+        test.one(false, fallback::memchr);
+    }
+}
+
+#[test]
+fn memchr2_find() {
+    for test in memchr_tests() {
+        test.two(false, memchr2);
+    }
+}
+
+#[test]
+fn memchr2_fallback_find() {
+    for test in memchr_tests() {
+        test.two(false, fallback::memchr2);
+    }
+}
+
+#[test]
+fn memchr3_find() {
+    for test in memchr_tests() {
+        test.three(false, memchr3);
+    }
+}
+
+#[test]
+fn memchr3_fallback_find() {
+    for test in memchr_tests() {
+        test.three(false, fallback::memchr3);
+    }
+}
+
+#[test]
+fn memrchr1_find() {
+    for test in memchr_tests() {
+        test.one(true, memrchr);
+    }
+}
+
+#[test]
+fn memrchr1_fallback_find() {
+    for test in memchr_tests() {
+        test.one(true, fallback::memrchr);
+    }
+}
+
+#[test]
+fn memrchr2_find() {
+    for test in memchr_tests() {
+        test.two(true, memrchr2);
+    }
+}
+
+#[test]
+fn memrchr2_fallback_find() {
+    for test in memchr_tests() {
+        test.two(true, fallback::memrchr2);
+    }
+}
+
+#[test]
+fn memrchr3_find() {
+    for test in memchr_tests() {
+        test.three(true, memrchr3);
+    }
+}
+
+#[test]
+fn memrchr3_fallback_find() {
+    for test in memchr_tests() {
+        test.three(true, fallback::memrchr3);
+    }
+}
+
+quickcheck! {
+    fn qc_memchr1_matches_naive(n1: u8, corpus: Vec<u8>) -> bool {
+        memchr(n1, &corpus) == naive::memchr(n1, &corpus)
+    }
+}
+
+quickcheck! {
+    fn qc_memchr2_matches_naive(n1: u8, n2: u8, corpus: Vec<u8>) -> bool {
+        memchr2(n1, n2, &corpus) == naive::memchr2(n1, n2, &corpus)
+    }
+}
+
+quickcheck! {
+    fn qc_memchr3_matches_naive(
+        n1: u8, n2: u8, n3: u8,
+        corpus: Vec<u8>
+    ) -> bool {
+        memchr3(n1, n2, n3, &corpus) == naive::memchr3(n1, n2, n3, &corpus)
+    }
+}
+
+quickcheck! {
+    fn qc_memrchr1_matches_naive(n1: u8, corpus: Vec<u8>) -> bool {
+        memrchr(n1, &corpus) == naive::memrchr(n1, &corpus)
+    }
+}
+
+quickcheck! {
+    fn qc_memrchr2_matches_naive(n1: u8, n2: u8, corpus: Vec<u8>) -> bool {
+        memrchr2(n1, n2, &corpus) == naive::memrchr2(n1, n2, &corpus)
+    }
+}
+
+quickcheck! {
+    fn qc_memrchr3_matches_naive(
+        n1: u8, n2: u8, n3: u8,
+        corpus: Vec<u8>
+    ) -> bool {
+        memrchr3(n1, n2, n3, &corpus) == naive::memrchr3(n1, n2, n3, &corpus)
+    }
+}
new file mode 100644
--- /dev/null
+++ b/third_party/rust/memchr/src/tests/mod.rs
@@ -0,0 +1,422 @@
+use std::iter::repeat;
+
+mod iter;
+mod memchr;
+
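+// These two "tests" only exist to record the target's endianness in the test
+// output.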
+#[cfg(target_endian = "little")]
+#[test]
+fn byte_order() {
+    eprintln!("LITTLE ENDIAN");
+}
+
+#[cfg(target_endian = "big")]
+#[test]
+fn byte_order() {
+    eprintln!("BIG ENDIAN");
+}
+
+/// Create a sequence of tests that should be run by memchr implementations.
+fn memchr_tests() -> Vec<MemchrTest> {
+    let mut tests = Vec::new();
+    for statict in MEMCHR_TESTS {
+        assert!(!statict.corpus.contains("%"), "% is not allowed in corpora");
+        assert!(!statict.corpus.contains("#"), "# is not allowed in corpora");
+        assert!(!statict.needles.contains(&b'%'), "% is an invalid needle");
+        assert!(!statict.needles.contains(&b'#'), "# is an invalid needle");
+
+        let t = MemchrTest {
+            corpus: statict.corpus.to_string(),
+            needles: statict.needles.to_vec(),
+            positions: statict.positions.to_vec(),
+        };
+        tests.push(t.clone());
+        tests.extend(t.expand());
+    }
+    tests
+}
+
+/// A set of tests for memchr-like functions.
+///
+/// These tests mostly try to cover the short string cases. We cover the longer
+/// string cases via the benchmarks (which are tests themselves), via
+/// quickcheck tests and via automatic expansion of each test case (by
+/// increasing the corpus size). Finally, we cover different alignment cases
+/// in the tests by varying the starting point of the slice.
+const MEMCHR_TESTS: &[MemchrTestStatic] = &[
+    // one needle (applied to memchr + memchr2 + memchr3)
+    MemchrTestStatic {
+        corpus: "a",
+        needles: &[b'a'],
+        positions: &[0],
+    },
+    MemchrTestStatic {
+        corpus: "aa",
+        needles: &[b'a'],
+        positions: &[0, 1],
+    },
+    MemchrTestStatic {
+        corpus: "aaa",
+        needles: &[b'a'],
+        positions: &[0, 1, 2],
+    },
+    MemchrTestStatic {
+        corpus: "",
+        needles: &[b'a'],
+        positions: &[],
+    },
+    MemchrTestStatic {
+        corpus: "z",
+        needles: &[b'a'],
+        positions: &[],
+    },
+    MemchrTestStatic {
+        corpus: "zz",
+        needles: &[b'a'],
+        positions: &[],
+    },
+    MemchrTestStatic {
+        corpus: "zza",
+        needles: &[b'a'],
+        positions: &[2],
+    },
+    MemchrTestStatic {
+        corpus: "zaza",
+        needles: &[b'a'],
+        positions: &[1, 3],
+    },
+    MemchrTestStatic {
+        corpus: "zzza",
+        needles: &[b'a'],
+        positions: &[3],
+    },
+    MemchrTestStatic {
+        corpus: "\x00a",
+        needles: &[b'a'],
+        positions: &[1],
+    },
+    MemchrTestStatic {
+        corpus: "\x00",
+        needles: &[b'\x00'],
+        positions: &[0],
+    },
+    MemchrTestStatic {
+        corpus: "\x00\x00",
+        needles: &[b'\x00'],
+        positions: &[0, 1],
+    },
+    MemchrTestStatic {
+        corpus: "\x00a\x00",
+        needles: &[b'\x00'],
+        positions: &[0, 2],
+    },
+    MemchrTestStatic {
+        corpus: "zzzzzzzzzzzzzzzza",
+        needles: &[b'a'],
+        positions: &[16],
+    },
+    MemchrTestStatic {
+        corpus: "zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzza",
+        needles: &[b'a'],
+        positions: &[32],
+    },
+
+    // two needles (applied to memchr2 + memchr3)
+    MemchrTestStatic {
+        corpus: "az",
+        needles: &[b'a', b'z'],
+        positions: &[0, 1],
+    },
+    MemchrTestStatic {
+        corpus: "az",
+        needles: &[b'a', b'z'],
+        positions: &[0, 1],
+    },
+    MemchrTestStatic {
+        corpus: "az",
+        needles: &[b'x', b'y'],
+        positions: &[],
+    },
+    MemchrTestStatic {
+        corpus: "az",
+        needles: &[b'a', b'y'],
+        positions: &[0],
+    },
+    MemchrTestStatic {
+        corpus: "az",
+        needles: &[b'x', b'z'],
+        positions: &[1],
+    },
+    MemchrTestStatic {
+        corpus: "yyyyaz",
+        needles: &[b'a', b'z'],
+        positions: &[4, 5],
+    },
+    MemchrTestStatic {
+        corpus: "yyyyaz",
+        needles: &[b'z', b'a'],
+        positions: &[4, 5],
+    },
+
+    // three needles (applied to memchr3)
+    MemchrTestStatic {
+        corpus: "xyz",
+        needles: &[b'x', b'y', b'z'],
+        positions: &[0, 1, 2],
+    },
+    MemchrTestStatic {
+        corpus: "zxy",
+        needles: &[b'x', b'y', b'z'],
+        positions: &[0, 1, 2],
+    },
+    MemchrTestStatic {
+        corpus: "zxy",
+        needles: &[b'x', b'a', b'z'],
+        positions: &[0, 1],
+    },
+    MemchrTestStatic {
+        corpus: "zxy",
+        needles: &[b't', b'a', b'z'],
+        positions: &[0],
+    },
+    MemchrTestStatic {
+        corpus: "yxz",
+        needles: &[b't', b'a', b'z'],
+        positions: &[2],
+    },
+];
+
+/// A description of a test on a memchr like function.
+#[derive(Clone, Debug)]
+struct MemchrTest {
+    /// The thing to search. We use `&str` instead of `&[u8]` because they
+    /// are nicer to write in tests, and we don't miss much since memchr
+    /// doesn't care about UTF-8.
+    ///
+    /// Corpora cannot contain either '%' or '#'. We use these bytes when
+    /// expanding test cases into many test cases, and we assume they are not
+    /// used. If they are used, `memchr_tests` will panic.
+    corpus: String,
+    /// The needles to search for. This is intended to be an "alternation" of
+    /// needles. The number of needles may cause this test to be skipped for
+    /// some memchr variants. For example, a test with 2 needles cannot be used
+    /// to test `memchr`, but can be used to test `memchr2` and `memchr3`.
+    /// However, a test with only 1 needle can be used to test all of `memchr`,
+    /// `memchr2` and `memchr3`. We achieve this by filling in the needles with
+    /// bytes that we never used in the corpus (such as '#').
+    needles: Vec<u8>,
+    /// The positions expected to match for all of the needles.
+    positions: Vec<usize>,
+}
+
+/// Like MemchrTest, but easier to define as a constant.
+#[derive(Clone, Debug)]
+struct MemchrTestStatic {
+    corpus: &'static str,
+    needles: &'static [u8],
+    positions: &'static [usize],
+}
+
+impl MemchrTest {
+    fn one<F: Fn(u8, &[u8]) -> Option<usize>>(
+        &self,
+        reverse: bool,
+        f: F,
+    ) {
+        let needles = match self.needles(1) {
+            None => return,
+            Some(needles) => needles,
+        };
+        // We test different alignments here. Since some implementations use
+        // AVX2, which can read 32 bytes at a time, we test at least that.
+        // Moreover, with loop unrolling, we sometimes process 64 (sse2) or 128
+        // (avx) bytes at a time, so we include that in our offsets as well.
+        //
+        // You might think this would cause most needles to not be found, but
+        // we actually expand our tests to include corpus sizes all the way up
+        // to >500 bytes, so we should exercise most branches.
+        for align in 0..130 {
+            let corpus = self.corpus(align);
+            assert_eq!(
+                self.positions(align, reverse).get(0).cloned(),
+                f(needles[0], corpus.as_bytes()),
+                "search for {:?} failed in: {:?} (len: {}, alignment: {})",
+                needles[0] as char,
+                corpus,
+                corpus.len(),
+                align
+            );
+        }
+    }
+
+    fn two<F: Fn(u8, u8, &[u8]) -> Option<usize>>(
+        &self,
+        reverse: bool,
+        f: F,
+    ) {
+        let needles = match self.needles(2) {
+            None => return,
+            Some(needles) => needles,
+        };
+        for align in 0..130 {
+            let corpus = self.corpus(align);
+            assert_eq!(
+                self.positions(align, reverse).get(0).cloned(),
+                f(needles[0], needles[1], corpus.as_bytes()),
+                "search for {:?}|{:?} failed in: {:?} \
+                 (len: {}, alignment: {})",
+                needles[0] as char,
+                needles[1] as char,
+                corpus,
+                corpus.len(),
+                align
+            );
+        }
+    }
+
+    fn three<F: Fn(u8, u8, u8, &[u8]) -> Option<usize>>(
+        &self,
+        reverse: bool,
+        f: F,
+    ) {
+        let needles = match self.needles(3) {
+            None => return,
+            Some(needles) => needles,
+        };
+        for align in 0..130 {
+            let corpus = self.corpus(align);
+            assert_eq!(
+                self.positions(align, reverse).get(0).cloned(),
+                f(needles[0], needles[1], needles[2], corpus.as_bytes()),
+                "search for {:?}|{:?}|{:?} failed in: {:?} \
+                 (len: {}, alignment: {})",
+                needles[0] as char,
+                needles[1] as char,
+                needles[2] as char,
+                corpus,
+                corpus.len(),
+                align
+            );
+        }
+    }
+
+    fn iter_one<'a, I, F>(&'a self, reverse: bool, f: F)
+    where F: FnOnce(u8, &'a [u8]) -> I,
+          I: Iterator<Item=usize>
+    {
+        if let Some(ns) = self.needles(1) {
+            self.iter(reverse, f(ns[0], self.corpus.as_bytes()));
+        }
+    }
+
+    fn iter_two<'a, I, F>(&'a self, reverse: bool, f: F)
+    where F: FnOnce(u8, u8, &'a [u8]) -> I,
+          I: Iterator<Item=usize>
+    {
+        if let Some(ns) = self.needles(2) {
+            self.iter(reverse, f(ns[0], ns[1], self.corpus.as_bytes()));
+        }
+    }
+
+    fn iter_three<'a, I, F>(&'a self, reverse: bool, f: F)
+    where F: FnOnce(u8, u8, u8, &'a [u8]) -> I,
+          I: Iterator<Item=usize>
+    {
+        if let Some(ns) = self.needles(3) {
+            self.iter(reverse, f(ns[0], ns[1], ns[2], self.corpus.as_bytes()));
+        }
+    }
+
+    /// Test that the positions yielded by the given iterator match the
+    /// positions in this test. If reverse is true, then reverse the positions
+    /// before comparing them.
+    fn iter<I: Iterator<Item=usize>>(&self, reverse: bool, it: I) {
+        assert_eq!(
+            self.positions(0, reverse),
+            it.collect::<Vec<usize>>(),
+            r"search for {:?} failed in: {:?}",
+            self.needles.iter().map(|&b| b as char).collect::<Vec<char>>(),
+            self.corpus
+        );
+    }
+
+    /// Expand this test into many variations of the same test.
+    ///
+    /// In particular, this will generate more tests with larger corpus sizes.
+    /// The expected positions are updated to maintain the integrity of the
+    /// test.
+    ///
+    /// This is important in testing a memchr implementation, because there are
+    /// often different cases depending on the length of the corpus.
+    ///
+    /// Note that we extend the corpus by adding `%` bytes, which we
+    /// don't otherwise use as a needle.
+    fn expand(&self) -> Vec<MemchrTest> {
+        let mut more = Vec::new();
+
+        // Add bytes to the start of the corpus.
+        for i in 1..515 {
+            let mut t = self.clone();
+            let mut new_corpus: String = repeat('%').take(i).collect();
+            new_corpus.push_str(&t.corpus);
+            t.corpus = new_corpus;
+            t.positions = t.positions.into_iter().map(|p| p + i).collect();
+            more.push(t);
+        }
+        // Add bytes to the end of the corpus.
+        for i in 1..515 {
+            let mut t = self.clone();
+            let padding: String = repeat('%').take(i).collect();
+            t.corpus.push_str(&padding);
+            more.push(t);
+        }
+
+        more
+    }
+
+    /// Return the corpus at the given alignment.
+    ///
+    /// If the alignment exceeds the length of the corpus, then this returns
+    /// an empty slice.
+    fn corpus(&self, align: usize) -> &str {
+        self.corpus.get(align..).unwrap_or("")
+    }
+
+    /// Return exactly `count` needles from this test. If this test has fewer
+    /// than `count` needles, then add `#` until the number of needles
+    /// matches `count`. If this test has more than `count` needles, then
+    /// return `None` (because there is no way to use this test data for a
+    /// search using fewer needles).
+    fn needles(&self, count: usize) -> Option<Vec<u8>> {
+        if self.needles.len() > count {
+            return None;
+        }
+
+        let mut needles = self.needles.to_vec();
+        for _ in needles.len()..count {
+            // we assume # is never used in tests.
+            needles.push(b'#');
+        }
+        Some(needles)
+    }
+
+    /// Return the positions in this test, reversed if `reverse` is true.
+    ///
+    /// If an alignment is given, then all positions greater than or equal to
+    /// it are shifted down by that amount; positions less than the alignment
+    /// are dropped.
+    fn positions(&self, align: usize, reverse: bool) -> Vec<usize> {
+        let positions =
+            if reverse {
+                let mut positions = self.positions.to_vec();
+                positions.reverse();
+                positions
+            } else {
+                self.positions.to_vec()
+            };
+        positions
+            .into_iter()
+            .filter(|&p| p >= align)
+            .map(|p| p - align)
+            .collect()
+    }
+}
new file mode 100644
--- /dev/null
+++ b/third_party/rust/memchr/src/x86/avx.rs
@@ -0,0 +1,739 @@
+use core::arch::x86_64::*;
+use core::cmp;
+use core::mem::size_of;
+
+use x86::sse2;
+
+const VECTOR_SIZE: usize = size_of::<__m256i>();
+const VECTOR_ALIGN: usize = VECTOR_SIZE - 1;
+
+// The number of bytes processed per iteration of the main memchr/memrchr loop.
+const LOOP_SIZE: usize = 4 * VECTOR_SIZE;
+
+// The number of bytes processed per iteration of the main memchr2/memrchr2
+// and memchr3/memrchr3 loops. There was no observable difference between 128
+// and 64 bytes in benchmarks; memchr3 in particular gets only a very slight
+// speed-up from the loop unrolling.
+const LOOP_SIZE2: usize = 2 * VECTOR_SIZE;
+
+#[target_feature(enable = "avx2")]
+pub unsafe fn memchr(n1: u8, haystack: &[u8]) -> Option<usize> {
+    // For a high level explanation for how this algorithm works, see the
+    // sse2 implementation. The avx implementation here is the same, but with
+    // 256-bit vectors instead of 128-bit vectors.
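+    //
+    // In outline: check the first (possibly unaligned) vector, round `ptr`
+    // up to a 32-byte boundary, then compare four vectors (128 bytes) per
+    // iteration with `_mm256_cmpeq_epi8`, OR-ing the results together and
+    // testing them with a single `_mm256_movemask_epi8`. Leftovers are
+    // handled one vector at a time, with a final overlapping load for any
+    // partial vector at the end.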
+
+    let start_ptr = haystack.as_ptr();
+    let end_ptr = haystack[haystack.len()..].as_ptr();
+    let mut ptr = start_ptr;
+
+    if haystack.len() < VECTOR_SIZE {
+        // For small haystacks, defer to the SSE2 implementation. Codegen
+        // suggests this completely avoids touching the AVX vectors.
+        return sse2::memchr(n1, haystack);
+    }
+
+    let vn1 = _mm256_set1_epi8(n1 as i8);
+    let loop_size = cmp::min(LOOP_SIZE, haystack.len());
+    if let Some(i) = forward_search1(start_ptr, end_ptr, ptr, vn1) {
+        return Some(i);
+    }
+
+    ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN));
+    debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr);
+    while loop_size == LOOP_SIZE && ptr <= end_ptr.sub(loop_size) {
+        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);
+
+        let a = _mm256_load_si256(ptr as *const __m256i);
+        let b = _mm256_load_si256(ptr.add(VECTOR_SIZE) as *const __m256i);
+        let c = _mm256_load_si256(ptr.add(2 * VECTOR_SIZE) as *const __m256i);
+        let d = _mm256_load_si256(ptr.add(3 * VECTOR_SIZE) as *const __m256i);
+        let eqa = _mm256_cmpeq_epi8(vn1, a);
+        let eqb = _mm256_cmpeq_epi8(vn1, b);
+        let eqc = _mm256_cmpeq_epi8(vn1, c);
+        let eqd = _mm256_cmpeq_epi8(vn1, d);
+        let or1 = _mm256_or_si256(eqa, eqb);
+        let or2 = _mm256_or_si256(eqc, eqd);
+        let or3 = _mm256_or_si256(or1, or2);
+        if _mm256_movemask_epi8(or3) != 0 {
+            let mut at = sub(ptr, start_ptr);
+            let mask = _mm256_movemask_epi8(eqa);
+            if mask != 0 {
+                return Some(at + forward_pos(mask));
+            }
+
+            at += VECTOR_SIZE;
+            let mask = _mm256_movemask_epi8(eqb);
+            if mask != 0 {
+                return Some(at + forward_pos(mask));
+            }
+
+            at += VECTOR_SIZE;
+            let mask = _mm256_movemask_epi8(eqc);
+            if mask != 0 {
+                return Some(at + forward_pos(mask));
+            }
+
+            at += VECTOR_SIZE;
+            let mask = _mm256_movemask_epi8(eqd);
+            debug_assert!(mask != 0);
+            return Some(at + forward_pos(mask));
+        }
+        ptr = ptr.add(loop_size);
+    }
+    while ptr <= end_ptr.sub(VECTOR_SIZE) {
+        debug_assert!(sub(end_ptr, ptr) >= VECTOR_SIZE);
+
+        if let Some(i) = forward_search1(start_ptr, end_ptr, ptr, vn1) {
+            return Some(i);
+        }
+        ptr = ptr.add(VECTOR_SIZE);
+    }
+    if ptr < end_ptr {
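+        // Fewer than VECTOR_SIZE bytes remain, so move `ptr` back just far
+        // enough that one final unaligned load ends exactly at `end_ptr`,
+        // re-examining a few bytes that were already checked.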
+        debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE);
+        ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr));
+        debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE);
+
+        return forward_search1(start_ptr, end_ptr, ptr, vn1);
+    }
+    None
+}
+
+#[target_feature(enable = "avx2")]
+pub unsafe fn memchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
+    let vn1 = _mm256_set1_epi8(n1 as i8);
+    let vn2 = _mm256_set1_epi8(n2 as i8);
+    let len = haystack.len();
+    let loop_size = cmp::min(LOOP_SIZE2, len);
+    let start_ptr = haystack.as_ptr();
+    let end_ptr = haystack[haystack.len()..].as_ptr();
+    let mut ptr = start_ptr;
+
+    if haystack.len() < VECTOR_SIZE {
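+        // The haystack is too short for even one 256-bit load, so scan it a
+        // byte at a time.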
+        while ptr < end_ptr {
+            if *ptr == n1 || *ptr == n2 {
+                return Some(sub(ptr, start_ptr));
+            }
+            ptr = ptr.offset(1);
+        }
+        return None;
+    }
+
+    if let Some(i) = forward_search2(start_ptr, end_ptr, ptr, vn1, vn2) {
+        return Some(i);
+    }
+
+    ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN));
+    debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr);
+    while loop_size == LOOP_SIZE2 && ptr <= end_ptr.sub(loop_size) {
+        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);
+
+        let a = _mm256_load_si256(ptr as *const __m256i);
+        let b = _mm256_load_si256(ptr.add(VECTOR_SIZE) as *const __m256i);
+        let eqa1 = _mm256_cmpeq_epi8(vn1, a);
+        let eqb1 = _mm256_cmpeq_epi8(vn1, b);
+        let eqa2 = _mm256_cmpeq_epi8(vn2, a);
+        let eqb2 = _mm256_cmpeq_epi8(vn2, b);
+        let or1 = _mm256_or_si256(eqa1, eqb1);
+        let or2 = _mm256_or_si256(eqa2, eqb2);
+        let or3 = _mm256_or_si256(or1, or2);
+        if _mm256_movemask_epi8(or3) != 0 {
+            let mut at = sub(ptr, start_ptr);
+            let mask1 = _mm256_movemask_epi8(eqa1);
+            let mask2 = _mm256_movemask_epi8(eqa2);
+            if mask1 != 0 || mask2 != 0 {
+                return Some(at + forward_pos2(mask1, mask2));
+            }
+
+            at += VECTOR_SIZE;
+            let mask1 = _mm256_movemask_epi8(eqb1);
+            let mask2 = _mm256_movemask_epi8(eqb2);
+            return Some(at + forward_pos2(mask1, mask2));
+        }
+        ptr = ptr.add(loop_size);
+    }
+    while ptr <= end_ptr.sub(VECTOR_SIZE) {
+        if let Some(i) = forward_search2(start_ptr, end_ptr, ptr, vn1, vn2) {
+            return Some(i);
+        }
+        ptr = ptr.add(VECTOR_SIZE);
+    }
+    if ptr < end_ptr {
+        debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE);
+        ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr));
+        debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE);
+
+        return forward_search2(start_ptr, end_ptr, ptr, vn1, vn2);
+    }
+    None
+}
+
+#[target_feature(enable = "avx2")]
+pub unsafe fn memchr3(
+    n1: u8, n2: u8, n3: u8,
+    haystack: &[u8]
+) -> Option<usize> {
+    let vn1 = _mm256_set1_epi8(n1 as i8);
+    let vn2 = _mm256_set1_epi8(n2 as i8);
+    let vn3 = _mm256_set1_epi8(n3 as i8);
+    let len = haystack.len();
+    let loop_size = cmp::min(LOOP_SIZE2, len);
+    let start_ptr = haystack.as_ptr();
+    let end_ptr = haystack[haystack.len()..].as_ptr();
+    let mut ptr = start_ptr;
+
+    if haystack.len() < VECTOR_SIZE {
+        while ptr < end_ptr {
+            if *ptr == n1 || *ptr == n2 || *ptr == n3 {
+                return Some(sub(ptr, start_ptr));
+            }
+            ptr = ptr.offset(1);
+        }
+        return None;
+    }
+
+    if let Some(i) = forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) {
+        return Some(i);
+    }
+
+    ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN));
+    debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr);
+    while loop_size == LOOP_SIZE2 && ptr <= end_ptr.sub(loop_size) {
+        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);
+
+        let a = _mm256_load_si256(ptr as *const __m256i);
+        let b = _mm256_load_si256(ptr.add(VECTOR_SIZE) as *const __m256i);
+        let eqa1 = _mm256_cmpeq_epi8(vn1, a);
+        let eqb1 = _mm256_cmpeq_epi8(vn1, b);
+        let eqa2 = _mm256_cmpeq_epi8(vn2, a);
+        let eqb2 = _mm256_cmpeq_epi8(vn2, b);
+        let eqa3 = _mm256_cmpeq_epi8(vn3, a);
+        let eqb3 = _mm256_cmpeq_epi8(vn3, b);
+        let or1 = _mm256_or_si256(eqa1, eqb1);
+        let or2 = _mm256_or_si256(eqa2, eqb2);
+        let or3 = _mm256_or_si256(eqa3, eqb3);
+        let or4 = _mm256_or_si256(or1, or2);
+        let or5 = _mm256_or_si256(or3, or4);
+        if _mm256_movemask_epi8(or5) != 0 {
+            let mut at = sub(ptr, start_ptr);
+            let mask1 = _mm256_movemask_epi8(eqa1);
+            let mask2 = _mm256_movemask_epi8(eqa2);
+            let mask3 = _mm256_movemask_epi8(eqa3);
+            if mask1 != 0 || mask2 != 0 || mask3 != 0 {
+                return Some(at + forward_pos3(mask1, mask2, mask3));
+            }
+
+            at += VECTOR_SIZE;
+            let mask1 = _mm256_movemask_epi8(eqb1);
+            let mask2 = _mm256_movemask_epi8(eqb2);
+            let mask3 = _mm256_movemask_epi8(eqb3);
+            // Only two vectors are loaded per iteration, so when the
+            // combined mask is non-zero and the `eqa*` masks are all zero,
+            // the match must be in the `eqb*` masks.
+            debug_assert!(mask1 != 0 || mask2 != 0 || mask3 != 0);
+            return Some(at + forward_pos3(mask1, mask2, mask3));
+        }
+        ptr = ptr.add(loop_size);
+    }
+    while ptr <= end_ptr.sub(VECTOR_SIZE) {
+        if let Some(i) = forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) {
+            return Some(i);
+        }
+        ptr = ptr.add(VECTOR_SIZE);
+    }
+    if ptr < end_ptr {
+        debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE);
+        ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr));
+        debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE);
+
+        return forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3);
+    }
+    None
+}
+
+#[target_feature(enable = "avx2")]
+pub unsafe fn memrchr(n1: u8, haystack: &[u8]) -> Option<usize> {
+    let vn1 = _mm256_set1_epi8(n1 as i8);
+    let len = haystack.len();
+    let loop_size = cmp::min(LOOP_SIZE, len);
+    let start_ptr = haystack.as_ptr();
+    let end_ptr = haystack[haystack.len()..].as_ptr();
+    let mut ptr = end_ptr;
+
+    if haystack.len() < VECTOR_SIZE {
+        while ptr > start_ptr {
+            ptr = ptr.offset(-1);
+            if *ptr == n1 {
+                return Some(sub(ptr, start_ptr));
+            }
+        }
+        return None;
+    }
+
+    ptr = ptr.sub(VECTOR_SIZE);
+    if let Some(i) = reverse_search1(start_ptr, end_ptr, ptr, vn1) {
+        return Some(i);
+    }
+
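+    // Round `end_ptr` down to the previous 32-byte boundary so the main loop
+    // can use aligned loads; the tail bytes above the boundary were already
+    // covered by the unaligned `reverse_search1` above (editorial note).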
+    ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8;
+    debug_assert!(start_ptr <= ptr && ptr <= end_ptr);
+    while loop_size == LOOP_SIZE && ptr >= start_ptr.add(loop_size) {
+        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);
+
+        ptr = ptr.sub(loop_size);
+        let a = _mm256_load_si256(ptr as *const __m256i);
+        let b = _mm256_load_si256(ptr.add(VECTOR_SIZE) as *const __m256i);
+        let c = _mm256_load_si256(ptr.add(2 * VECTOR_SIZE) as *const __m256i);
+        let d = _mm256_load_si256(ptr.add(3 * VECTOR_SIZE) as *const __m256i);
+        let eqa = _mm256_cmpeq_epi8(vn1, a);
+        let eqb = _mm256_cmpeq_epi8(vn1, b);
+        let eqc = _mm256_cmpeq_epi8(vn1, c);
+        let eqd = _mm256_cmpeq_epi8(vn1, d);
+        let or1 = _mm256_or_si256(eqa, eqb);
+        let or2 = _mm256_or_si256(eqc, eqd);
+        let or3 = _mm256_or_si256(or1, or2);
+        if _mm256_movemask_epi8(or3) != 0 {
+            let mut at = sub(ptr.add(3 * VECTOR_SIZE), start_ptr);
+            let mask = _mm256_movemask_epi8(eqd);
+            if mask != 0 {
+                return Some(at + reverse_pos(mask));
+            }
+
+            at -= VECTOR_SIZE;
+            let mask = _mm256_movemask_epi8(eqc);
+            if mask != 0 {
+                return Some(at + reverse_pos(mask));
+            }
+
+            at -= VECTOR_SIZE;
+            let mask = _mm256_movemask_epi8(eqb);
+            if mask != 0 {
+                return Some(at + reverse_pos(mask));
+            }
+
+            at -= VECTOR_SIZE;
+            let mask = _mm256_movemask_epi8(eqa);
+            debug_assert!(mask != 0);
+            return Some(at + reverse_pos(mask));
+        }
+    }
+    while ptr >= start_ptr.add(VECTOR_SIZE) {
+        ptr = ptr.sub(VECTOR_SIZE);
+        if let Some(i) = reverse_search1(start_ptr, end_ptr, ptr, vn1) {
+            return Some(i);
+        }
+    }
+    if ptr > start_ptr {
+        debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE);
+        return reverse_search1(start_ptr, end_ptr, start_ptr, vn1);
+    }
+    None
+}
+
+#[target_feature(enable = "avx2")]
+pub unsafe fn memrchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
+    let vn1 = _mm256_set1_epi8(n1 as i8);
+    let vn2 = _mm256_set1_epi8(n2 as i8);
+    let len = haystack.len();
+    let loop_size = cmp::min(LOOP_SIZE2, len);
+    let start_ptr = haystack.as_ptr();
+    let end_ptr = haystack[haystack.len()..].as_ptr();
+    let mut ptr = end_ptr;
+
+    if haystack.len() < VECTOR_SIZE {
+        while ptr > start_ptr {
+            ptr = ptr.offset(-1);
+            if *ptr == n1 || *ptr == n2 {
+                return Some(sub(ptr, start_ptr));
+            }
+        }
+        return None;
+    }
+
+    ptr = ptr.sub(VECTOR_SIZE);
+    if let Some(i) = reverse_search2(start_ptr, end_ptr, ptr, vn1, vn2) {
+        return Some(i);
+    }
+
+    ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8;
+    debug_assert!(start_ptr <= ptr && ptr <= end_ptr);
+    while loop_size == LOOP_SIZE2 && ptr >= start_ptr.add(loop_size) {
+        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);
+
+        ptr = ptr.sub(loop_size);
+        let a = _mm256_load_si256(ptr as *const __m256i);
+        let b = _mm256_load_si256(ptr.add(VECTOR_SIZE) as *const __m256i);
+        let eqa1 = _mm256_cmpeq_epi8(vn1, a);
+        let eqb1 = _mm256_cmpeq_epi8(vn1, b);
+        let eqa2 = _mm256_cmpeq_epi8(vn2, a);
+        let eqb2 = _mm256_cmpeq_epi8(vn2, b);
+        let or1 = _mm256_or_si256(eqa1, eqb1);
+        let or2 = _mm256_or_si256(eqa2, eqb2);
+        let or3 = _mm256_or_si256(or1, or2);
+        if _mm256_movemask_epi8(or3) != 0 {
+            let mut at = sub(ptr.add(VECTOR_SIZE), start_ptr);
+            let mask1 = _mm256_movemask_epi8(eqb1);
+            let mask2 = _mm256_movemask_epi8(eqb2);
+            if mask1 != 0 || mask2 != 0 {
+                return Some(at + reverse_pos2(mask1, mask2));
+            }
+
+            at -= VECTOR_SIZE;
+            let mask1 = _mm256_movemask_epi8(eqa1);
+            let mask2 = _mm256_movemask_epi8(eqa2);
+            return Some(at + reverse_pos2(mask1, mask2));
+        }
+    }
+    while ptr >= start_ptr.add(VECTOR_SIZE) {
+        ptr = ptr.sub(VECTOR_SIZE);
+        if let Some(i) = reverse_search2(start_ptr, end_ptr, ptr, vn1, vn2) {
+            return Some(i);
+        }
+    }
+    if ptr > start_ptr {
+        debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE);
+        return reverse_search2(start_ptr, end_ptr, start_ptr, vn1, vn2);
+    }
+    None
+}
+
+#[target_feature(enable = "avx2")]
+pub unsafe fn memrchr3(
+    n1: u8, n2: u8, n3: u8,
+    haystack: &[u8],
+) -> Option<usize> {
+    let vn1 = _mm256_set1_epi8(n1 as i8);
+    let vn2 = _mm256_set1_epi8(n2 as i8);
+    let vn3 = _mm256_set1_epi8(n3 as i8);
+    let len = haystack.len();
+    let loop_size = cmp::min(LOOP_SIZE2, len);
+    let start_ptr = haystack.as_ptr();
+    let end_ptr = haystack[haystack.len()..].as_ptr();
+    let mut ptr = end_ptr;
+
+    if haystack.len() < VECTOR_SIZE {
+        while ptr > start_ptr {
+            ptr = ptr.offset(-1);
+            if *ptr == n1 || *ptr == n2 || *ptr == n3 {
+                return Some(sub(ptr, start_ptr));
+            }
+        }
+        return None;
+    }
+
+    ptr = ptr.sub(VECTOR_SIZE);
+    if let Some(i) = reverse_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) {
+        return Some(i);
+    }
+
+    ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8;
+    debug_assert!(start_ptr <= ptr && ptr <= end_ptr);
+    while loop_size == LOOP_SIZE2 && ptr >= start_ptr.add(loop_size) {
+        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);
+
+        ptr = ptr.sub(loop_size);
+        let a = _mm256_load_si256(ptr as *const __m256i);
+        let b = _mm256_load_si256(ptr.add(VECTOR_SIZE) as *const __m256i);
+        let eqa1 = _mm256_cmpeq_epi8(vn1, a);
+        let eqb1 = _mm256_cmpeq_epi8(vn1, b);
+        let eqa2 = _mm256_cmpeq_epi8(vn2, a);
+        let eqb2 = _mm256_cmpeq_epi8(vn2, b);
+        let eqa3 = _mm256_cmpeq_epi8(vn3, a);
+        let eqb3 = _mm256_cmpeq_epi8(vn3, b);
+        let or1 = _mm256_or_si256(eqa1, eqb1);
+        let or2 = _mm256_or_si256(eqa2, eqb2);
+        let or3 = _mm256_or_si256(eqa3, eqb3);
+        let or4 = _mm256_or_si256(or1, or2);
+        let or5 = _mm256_or_si256(or3, or4);
+        if _mm256_movemask_epi8(or5) != 0 {
+            let mut at = sub(ptr.add(VECTOR_SIZE), start_ptr);
+            let mask1 = _mm256_movemask_epi8(eqb1);
+            let mask2 = _mm256_movemask_epi8(eqb2);
+            let mask3 = _mm256_movemask_epi8(eqb3);
+            if mask1 != 0 || mask2 != 0 || mask3 != 0 {
+                return Some(at + reverse_pos3(mask1, mask2, mask3));
+            }
+
+            at -= VECTOR_SIZE;
+            let mask1 = _mm256_movemask_epi8(eqa1);
+            let mask2 = _mm256_movemask_epi8(eqa2);
+            let mask3 = _mm256_movemask_epi8(eqa3);
+            return Some(at + reverse_pos3(mask1, mask2, mask3));
+        }
+    }
+    while ptr >= start_ptr.add(VECTOR_SIZE) {
+        ptr = ptr.sub(VECTOR_SIZE);
+        if let Some(i) = reverse_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) {
+            return Some(i);
+        }
+    }
+    if ptr > start_ptr {
+        debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE);
+        return reverse_search3(start_ptr, end_ptr, start_ptr, vn1, vn2, vn3);
+    }
+    None
+}
+
+#[target_feature(enable = "avx2")]
+unsafe fn forward_search1(
+    start_ptr: *const u8,
+    end_ptr: *const u8,
+    ptr: *const u8,
+    vn1: __m256i,
+) -> Option<usize> {
+    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
+    debug_assert!(start_ptr <= ptr);
+    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));
+
+    let chunk = _mm256_loadu_si256(ptr as *const __m256i);
+    let mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(chunk, vn1));
+    if mask != 0 {
+        Some(sub(ptr, start_ptr) + forward_pos(mask))
+    } else {
+        None
+    }
+}
+
+#[target_feature(enable = "avx2")]
+unsafe fn forward_search2(
+    start_ptr: *const u8,
+    end_ptr: *const u8,
+    ptr: *const u8,
+    vn1: __m256i,
+    vn2: __m256i,
+) -> Option<usize> {
+    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
+    debug_assert!(start_ptr <= ptr);
+    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));
+
+    let chunk = _mm256_loadu_si256(ptr as *const __m256i);
+    let eq1 = _mm256_cmpeq_epi8(chunk, vn1);
+    let eq2 = _mm256_cmpeq_epi8(chunk, vn2);
+    if _mm256_movemask_epi8(_mm256_or_si256(eq1, eq2)) != 0 {
+        let mask1 = _mm256_movemask_epi8(eq1);
+        let mask2 = _mm256_movemask_epi8(eq2);
+        Some(sub(ptr, start_ptr) + forward_pos2(mask1, mask2))
+    } else {
+        None
+    }
+}
+
+#[target_feature(enable = "avx2")]
+unsafe fn forward_search3(
+    start_ptr: *const u8,
+    end_ptr: *const u8,
+    ptr: *const u8,
+    vn1: __m256i,
+    vn2: __m256i,
+    vn3: __m256i,
+) -> Option<usize> {
+    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
+    debug_assert!(start_ptr <= ptr);
+    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));
+
+    let chunk = _mm256_loadu_si256(ptr as *const __m256i);
+    let eq1 = _mm256_cmpeq_epi8(chunk, vn1);
+    let eq2 = _mm256_cmpeq_epi8(chunk, vn2);
+    let eq3 = _mm256_cmpeq_epi8(chunk, vn3);
+    let or = _mm256_or_si256(eq1, eq2);
+    if _mm256_movemask_epi8(_mm256_or_si256(or, eq3)) != 0 {
+        let mask1 = _mm256_movemask_epi8(eq1);
+        let mask2 = _mm256_movemask_epi8(eq2);
+        let mask3 = _mm256_movemask_epi8(eq3);
+        Some(sub(ptr, start_ptr) + forward_pos3(mask1, mask2, mask3))
+    } else {
+        None
+    }
+}
+
+#[target_feature(enable = "avx2")]
+unsafe fn reverse_search1(
+    start_ptr: *const u8,
+    end_ptr: *const u8,
+    ptr: *const u8,
+    vn1: __m256i,
+) -> Option<usize> {
+    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
+    debug_assert!(start_ptr <= ptr);
+    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));
+
+    let chunk = _mm256_loadu_si256(ptr as *const __m256i);
+    let mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(vn1, chunk));
+    if mask != 0 {
+        Some(sub(ptr, start_ptr) + reverse_pos(mask))
+    } else {
+        None
+    }
+}
+
+#[target_feature(enable = "avx2")]
+unsafe fn reverse_search2(
+    start_ptr: *const u8,
+    end_ptr: *const u8,
+    ptr: *const u8,
+    vn1: __m256i,
+    vn2: __m256i,
+) -> Option<usize> {
+    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
+    debug_assert!(start_ptr <= ptr);
+    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));
+
+    let chunk = _mm256_loadu_si256(ptr as *const __m256i);
+    let eq1 = _mm256_cmpeq_epi8(chunk, vn1);
+    let eq2 = _mm256_cmpeq_epi8(chunk, vn2);
+    if _mm256_movemask_epi8(_mm256_or_si256(eq1, eq2)) != 0 {
+        let mask1 = _mm256_movemask_epi8(eq1);
+        let mask2 = _mm256_movemask_epi8(eq2);
+        Some(sub(ptr, start_ptr) + reverse_pos2(mask1, mask2))
+    } else {
+        None
+    }
+}
+
+#[target_feature(enable = "avx2")]
+unsafe fn reverse_search3(
+    start_ptr: *const u8,
+    end_ptr: *const u8,
+    ptr: *const u8,
+    vn1: __m256i,
+    vn2: __m256i,
+    vn3: __m256i,
+) -> Option<usize> {
+    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
+    debug_assert!(start_ptr <= ptr);
+    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));
+
+    let chunk = _mm256_loadu_si256(ptr as *const __m256i);
+    let eq1 = _mm256_cmpeq_epi8(chunk, vn1);
+    let eq2 = _mm256_cmpeq_epi8(chunk, vn2);
+    let eq3 = _mm256_cmpeq_epi8(chunk, vn3);
+    let or = _mm256_or_si256(eq1, eq2);
+    if _mm256_movemask_epi8(_mm256_or_si256(or, eq3)) != 0 {
+        let mask1 = _mm256_movemask_epi8(eq1);
+        let mask2 = _mm256_movemask_epi8(eq2);
+        let mask3 = _mm256_movemask_epi8(eq3);
+        Some(sub(ptr, start_ptr) + reverse_pos3(mask1, mask2, mask3))
+    } else {
+        None
+    }
+}
+
+/// Compute the position of the first matching byte from the given mask. The
+/// position returned is always in the range [0, 31].
+///
+/// The mask given is expected to be the result of _mm256_movemask_epi8.
+fn forward_pos(mask: i32) -> usize {
+    // We are dealing with little endian here, where the most significant byte
+    // is at a higher address. That means the least significant bit that is set
+    // corresponds to the position of our first matching byte. That position
+    // corresponds to the number of zeros after the least significant bit.
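+    //
+    // Illustrative example (editorial): matches at chunk bytes 3 and 7
+    // produce a mask of 0b1000_1000, and trailing_zeros(0b1000_1000) == 3,
+    // the offset of the first match.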
+    mask.trailing_zeros() as usize
+}
+
+/// Compute the position of the first matching byte from the given masks. The
+/// position returned is always in the range [0, 31]. Each mask corresponds to
+/// the equality comparison of a single byte.
+///
+/// The masks given are expected to be the result of _mm256_movemask_epi8,
+/// where at least one of the masks is non-zero (i.e., indicates a match).
+fn forward_pos2(mask1: i32, mask2: i32) -> usize {
+    debug_assert!(mask1 != 0 || mask2 != 0);
+
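+    // Editorial note: if one mask is zero, forward_pos returns
+    // trailing_zeros(0) == 32, which exceeds every valid position in
+    // [0, 31], so the minimum below still comes from the non-zero mask.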
+    let i1 = forward_pos(mask1);
+    let i2 = forward_pos(mask2);
+    if i1 < i2 { i1 } else { i2 }
+}
+
+/// Compute the position of the first matching byte from the given masks. The
+/// position returned is always in the range [0, 31]. Each mask corresponds to
+/// the equality comparison of a single byte.
+///
+/// The masks given are expected to be the result of _mm256_movemask_epi8,
+/// where at least one of the masks is non-zero (i.e., indicates a match).
+fn forward_pos3(mask1: i32, mask2: i32, mask3: i32) -> usize {
+    debug_assert!(mask1 != 0 || mask2 != 0 || mask3 != 0);
+
+    let i1 = forward_pos(mask1);
+    let i2 = forward_pos(mask2);
+    let i3 = forward_pos(mask3);
+    if i1 < i2 && i1 < i3 {
+        i1
+    } else if i2 < i3 {
+        i2
+    } else {
+        i3
+    }
+}
+
+/// Compute the position of the last matching byte from the given mask. The
+/// position returned is always in the range [0, 31].
+///
+/// The mask given is expected to be the result of _mm256_movemask_epi8.
+fn reverse_pos(mask: i32) -> usize {
+    // We are dealing with little endian here, where the most significant byte
+    // is at a higher address. That means the most significant bit that is set
+    // corresponds to the position of our last matching byte. The position from
+    // the end of the mask is therefore the number of leading zeros in a 32
+    // bit integer, and the position from the start of the mask is therefore
+    // 32 - (leading zeros) - 1.
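+    //
+    // Illustrative example (editorial): matches at chunk bytes 3 and 7 give
+    // a mask of 0b1000_1000; (0b1000_1000u32).leading_zeros() == 24, and
+    // 32 - 24 - 1 == 7, the offset of the last match.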
+    VECTOR_SIZE - (mask as u32).leading_zeros() as usize - 1
+}
+
+/// Compute the position of the last matching byte from the given masks. The
+/// position returned is always in the range [0, 31]. Each mask corresponds to
+/// the equality comparison of a single byte.
+///
+/// The masks given are expected to be the result of _mm256_movemask_epi8,
+/// where at least one of the masks is non-zero (i.e., indicates a match).
+fn reverse_pos2(mask1: i32, mask2: i32) -> usize {
+    debug_assert!(mask1 != 0 || mask2 != 0);
+
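+    // Editorial note: the zero cases must be handled explicitly, because
+    // reverse_pos(0) would compute 32 - 32 - 1 and underflow.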
+    if mask1 == 0 {
+        reverse_pos(mask2)
+    } else if mask2 == 0 {
+        reverse_pos(mask1)
+    } else {
+        let i1 = reverse_pos(mask1);
+        let i2 = reverse_pos(mask2);
+        if i1 > i2 { i1 } else { i2 }
+    }
+}
+
+/// Compute the position of the last matching byte from the given masks. The
+/// position returned is always in the range [0, 31]. Each mask corresponds to
+/// the equality comparison of a single byte.
+///
+/// The masks given are expected to be the result of _mm256_movemask_epi8,
+/// where at least one of the masks is non-zero (i.e., indicates a match).
+fn reverse_pos3(mask1: i32, mask2: i32, mask3: i32) -> usize {
+    debug_assert!(mask1 != 0 || mask2 != 0 || mask3 != 0);
+
+    if mask1 == 0 {
+        reverse_pos2(mask2, mask3)
+    } else if mask2 == 0 {
+        reverse_pos2(mask1, mask3)
+    } else if mask3 == 0 {
+        reverse_pos2(mask1, mask2)
+    } else {
+        let i1 = reverse_pos(mask1);
+        let i2 = reverse_pos(mask2);
+        let i3 = reverse_pos(mask3);
+        if i1 > i2 && i1 > i3 {
+            i1
+        } else if i2 > i3 {
+            i2
+        } else {
+            i3
+        }
+    }
+}
+
+/// Subtract `b` from `a` and return the difference. `a` should be greater than
+/// or equal to `b`.
+fn sub(a: *const u8, b: *const u8) -> usize {
+    debug_assert!(a >= b);
+    (a as usize) - (b as usize)
+}
new file mode 100644
--- /dev/null
+++ b/third_party/rust/memchr/src/x86/mod.rs
@@ -0,0 +1,105 @@
+use fallback;
+
+// We only use AVX when we can detect at runtime whether it's available, which
+// requires std.
+#[cfg(feature = "use_std")]
+mod avx;
+mod sse2;
+
+// This macro employs a gcc-like "ifunc" trick whereby, upon first calling
+// `memchr` (for example), CPU feature detection will be performed at runtime
+// to determine the best implementation to use. After CPU feature detection
+// is done, we replace `memchr`'s function pointer with the selection. Upon
+// subsequent invocations, the CPU-specific routine is invoked directly, which
+// skips the CPU feature detection and subsequent branch that's required.
+//
+// While this typically doesn't matter for rare occurrences or when used on
+// larger haystacks, `memchr` can be called in tight loops where the overhead
+// of this branch can actually add up *and is measurable*. This trick was
+// necessary to bring this implementation up to glibc's speeds for the 'tiny'
+// benchmarks, for example.
+//
+// At some point, I expect the Rust ecosystem will get a nice macro for doing
+// exactly this, at which point, we can replace our hand-jammed version of it.
+//
+// N.B. The ifunc strategy does prevent function inlining of course, but on
+// modern CPUs, you'll probably end up with the AVX2 implementation, which
+// probably can't be inlined anyway---unless you've compiled your entire
+// program with AVX2 enabled. However, even then, the various memchr
+// implementations aren't exactly small, so inlining might not help anyway!
+#[cfg(feature = "use_std")]
+macro_rules! ifunc {
+    ($fnty:ty, $name:ident, $haystack:ident, $($needle:ident),+) => {{
+        use std::mem;
+        use std::sync::atomic::{AtomicPtr, Ordering};
+
+        type FnRaw = *mut ();
+
+        static FN: AtomicPtr<()> = AtomicPtr::new(detect as FnRaw);
+
+        fn detect($($needle: u8),+, haystack: &[u8]) -> Option<usize> {
+            let fun =
+                if cfg!(memchr_runtime_avx) && is_x86_feature_detected!("avx2") {
+                    avx::$name as FnRaw
+                } else if cfg!(memchr_runtime_sse2) {
+                    sse2::$name as FnRaw
+                } else {
+                    fallback::$name as FnRaw
+                };
+            FN.store(fun as FnRaw, Ordering::Relaxed);
+            unsafe {
+                mem::transmute::<FnRaw, $fnty>(fun)($($needle),+, haystack)
+            }
+        }
+
+        unsafe {
+            let fun = FN.load(Ordering::Relaxed);
+            mem::transmute::<FnRaw, $fnty>(fun)($($needle),+, $haystack)
+        }
+    }}
+}
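+
+// Illustrative expansion (editorial): `ifunc!(fn(u8, &[u8]) -> Option<usize>,
+// memchr, haystack, n1)` produces a static `FN: AtomicPtr<()>` seeded with
+// `detect`, plus a call that loads `FN`, transmutes it to the concrete `fn`
+// type and invokes it. The first call lands in `detect`, which stores the
+// selected routine back into `FN` before delegating to it.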
+
+// When std isn't enabled (std is what provides runtime CPU feature
+// detection), or if runtime CPU feature detection has been explicitly
+// disabled, then just call our optimized SSE2 routine directly. SSE2 is
+// available on all x86_64 targets, so no CPU feature detection is necessary.
+#[cfg(not(feature = "use_std"))]
+macro_rules! ifunc {
+    ($fnty:ty, $name:ident, $haystack:ident, $($needle:ident),+) => {{
+        if cfg!(memchr_runtime_sse2) {
+            unsafe { sse2::$name($($needle),+, $haystack) }
+        } else {
+            fallback::$name($($needle),+, $haystack)
+        }
+    }}
+}
+
+#[inline(always)]
+pub fn memchr(n1: u8, haystack: &[u8]) -> Option<usize> {
+    ifunc!(fn(u8, &[u8]) -> Option<usize>, memchr, haystack, n1)
+}
+
+#[inline(always)]
+pub fn memchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
+    ifunc!(fn(u8, u8, &[u8]) -> Option<usize>, memchr2, haystack, n1, n2)
+}
+
+#[inline(always)]
+pub fn memchr3(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option<usize> {
+    ifunc!(fn(u8, u8, u8, &[u8]) -> Option<usize>, memchr3, haystack, n1, n2, n3)
+}
+
+#[inline(always)]
+pub fn memrchr(n1: u8, haystack: &[u8]) -> Option<usize> {
+    ifunc!(fn(u8, &[u8]) -> Option<usize>, memrchr, haystack, n1)
+}
+
+#[inline(always)]
+pub fn memrchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
+    ifunc!(fn(u8, u8, &[u8]) -> Option<usize>, memrchr2, haystack, n1, n2)
+}
+
+#[inline(always)]
+pub fn memrchr3(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option<usize> {
+    ifunc!(fn(u8, u8, u8, &[u8]) -> Option<usize>, memrchr3, haystack, n1, n2, n3)
+}
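+
+// Illustrative usage (editorial): `memchr(b'z', b"abczde")` returns
+// `Some(3)`. Only the first call per process pays for CPU feature detection;
+// later calls go straight through the cached function pointer.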
new file mode 100644
--- /dev/null
+++ b/third_party/rust/memchr/src/x86/sse2.rs
@@ -0,0 +1,827 @@
+use core::arch::x86_64::*;
+use core::cmp;
+use core::mem::size_of;
+
+const VECTOR_SIZE: usize = size_of::<__m128i>();
+const VECTOR_ALIGN: usize = VECTOR_SIZE - 1;
+
+// The number of bytes processed in one iteration of memchr/memrchr.
+const LOOP_SIZE: usize = 4 * VECTOR_SIZE;
+
+// The number of bytes processed in one iteration of memchr2/memrchr2 and
+// memchr3/memrchr3. There was no observable difference between 64 and 32 bytes
+// in benchmarks. memchr3 in particular only gets a very slight speed up from
+// the loop unrolling.
+const LOOP_SIZE2: usize = 2 * VECTOR_SIZE;
+
+#[target_feature(enable = "sse2")]
+pub unsafe fn memchr(n1: u8, haystack: &[u8]) -> Option<usize> {
+    // What follows is a fast SSE2-only algorithm to detect the position of
+    // `n1` in `haystack` if it exists. From what I know, this is the "classic"
+    // algorithm. I believe it can be found in places like glibc and Go's
+    // standard library. It appears to be well known and is elaborated on in
+    // more detail here: https://gms.tf/stdfind-and-memchr-optimizations.html
+    //
+    // While this routine is very long, the basic idea is actually very simple
+    // and can be expressed straightforwardly in pseudocode:
+    //
+    //     needle = (n1 << 15) | (n1 << 14) | ... | (n1 << 1) | n1
+    //     while i <= haystack.len() - 16:
+    //       // A 16 byte vector. Each byte in chunk corresponds to a byte in
+    //       // the haystack.
+    //       chunk = haystack[i:i+16]
+    //       // Compare bytes in needle with bytes in chunk. The result is a 16
+    //       // byte chunk where each byte is 0xFF if the corresponding bytes
+    //       // in needle and chunk were equal, or 0x00 otherwise.
+    //       eqs = cmpeq(needle, chunk)
+    //       // Return a 32 bit integer where the most significant 16 bits
+    //       // are always 0 and the lower 16 bits correspond to whether the
+    //       // most significant bit in the corresponding byte in `eqs` is set.
+    //       // In other words, `mask as u16` has bit i set if and only if
+    //       // needle[i] == chunk[i].
+    //       mask = movemask(eqs)
+    //
+    //       // Mask is 0 if there is no match, and non-zero otherwise.
+    //       if mask != 0:
+    //         // trailing_zeros tells us the position of the least significant
+    //         // bit that is set.
+    //         return i + trailing_zeros(mask)
+    //
+    //     // haystack length may not be a multiple of 16, so search the rest.
+    //     while i < haystack.len():
+    //       if haystack[i] == n1:
+    //         return i
+    //
+    //     // No match found.
+    //     return NULL
+    //
+    // In fact, we could loosely translate the above code to Rust line-for-line
+    // and it would be a pretty fast algorithm. But, we pull out all the stops
+    // to go as fast as possible:
+    //
+    // 1. We use aligned loads. That is, we do some finagling to make sure our
+    //    primary loop not only proceeds in increments of 16 bytes, but that
+    //    the address of haystack's pointer that we dereference is aligned to
+    //    16 bytes. 16 is a magic number here because it is the size of SSE2
+    //    128-bit vector. (For the AVX2 algorithm, 32 is the magic number.)
+    //    Therefore, to get aligned loads, our pointer's address must be evenly
+    //    divisible by 16.
+    // 2. Our primary loop proceeds 64 bytes at a time instead of 16. It's
+    //    kind of like loop unrolling, but we combine the equality comparisons
+    //    using a vector OR such that we only need to extract a single mask to
+    //    determine whether a match exists or not. If so, then we do some
+    //    book-keeping to determine the precise location but otherwise mush on.
+    // 3. We use our "chunk" comparison routine in as many places as possible,
+    //    even if it means using unaligned loads. In particular, if haystack
+    //    starts with an unaligned address, then we do an unaligned load to
+    //    search the first 16 bytes. We then start our primary loop at the
+    //    smallest subsequent aligned address, which will actually overlap with
+    //    previously searched bytes. But we're OK with that. We do a similar
+    //    dance at the end of our primary loop. Finally, to avoid a
+    //    byte-at-a-time loop at the end, we do a final 16 byte unaligned load
+    //    that may overlap with a previous load. This is OK because it converts
+    //    a loop into a small number of very fast vector instructions.
+    //
+    // The primary downside of this algorithm is that it's effectively
+    // completely unsafe. Therefore, we have to be super careful to avoid
+    // undefined behavior:
+    //
+    // 1. We use raw pointers everywhere. Not only does dereferencing a pointer
+    //    require the pointer to be valid, but we actually can't even store the
+    //    address of an invalid pointer (unless it's 1 past the end of
+    //    haystack) without sacrificing performance.
+    // 2. _mm_loadu_si128 is used when you don't care about alignment, and
+    //    _mm_load_si128 is used when you do care. You cannot use the latter
+    //    on unaligned pointers.
+    // 3. We make liberal use of debug_assert! to check assumptions.
+    // 4. We make a concerted effort to stick with pointers instead of indices.
+    //    Indices are nicer because there's less to worry about with them (see
+    //    above about pointer offsets), but I could not get the compiler to
+    //    produce as good of code as what the below produces. In any case,
+    //    pointers are what we really care about here, and alignment is
+    //    expressed a bit more naturally with them.
+    //
+    // In general, most of the algorithms in this crate have a similar
+    // structure to what you see below, so this comment applies fairly well to
+    // all of them.
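+    //
+    // As a concrete, illustrative example of the alignment dance in point 3
+    // (editorial): if `start_ptr as usize & VECTOR_ALIGN == 5`, the head does
+    // one unaligned load covering bytes 0..16, and the main loop then starts
+    // at `start_ptr.add(16 - 5)`, the next 16-byte boundary. Its first
+    // aligned load covers bytes 11..27, re-scanning the five overlapping
+    // bytes 11..16, which is harmless for a first-match search.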
+
+    let vn1 = _mm_set1_epi8(n1 as i8);
+    let len = haystack.len();
+    let loop_size = cmp::min(LOOP_SIZE, len);
+    let start_ptr = haystack.as_ptr();
+    let end_ptr = haystack[haystack.len()..].as_ptr();
+    let mut ptr = start_ptr;
+
+    if haystack.len() < VECTOR_SIZE {
+        while ptr < end_ptr {
+            if *ptr == n1 {
+                return Some(sub(ptr, start_ptr));
+            }
+            ptr = ptr.offset(1);
+        }
+        return None;
+    }
+
+    if let Some(i) = forward_search1(start_ptr, end_ptr, ptr, vn1) {
+        return Some(i);
+    }
+
+    ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN));
+    debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr);
+    while loop_size == LOOP_SIZE && ptr <= end_ptr.sub(loop_size) {
+        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);
+
+        let a = _mm_load_si128(ptr as *const __m128i);
+        let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i);
+        let c = _mm_load_si128(ptr.add(2 * VECTOR_SIZE) as *const __m128i);
+        let d = _mm_load_si128(ptr.add(3 * VECTOR_SIZE) as *const __m128i);
+        let eqa = _mm_cmpeq_epi8(vn1, a);
+        let eqb = _mm_cmpeq_epi8(vn1, b);
+        let eqc = _mm_cmpeq_epi8(vn1, c);
+        let eqd = _mm_cmpeq_epi8(vn1, d);
+        let or1 = _mm_or_si128(eqa, eqb);
+        let or2 = _mm_or_si128(eqc, eqd);
+        let or3 = _mm_or_si128(or1, or2);
+        if _mm_movemask_epi8(or3) != 0 {
+            let mut at = sub(ptr, start_ptr);
+            let mask = _mm_movemask_epi8(eqa);
+            if mask != 0 {
+                return Some(at + forward_pos(mask));
+            }
+
+            at += VECTOR_SIZE;
+            let mask = _mm_movemask_epi8(eqb);
+            if mask != 0 {
+                return Some(at + forward_pos(mask));
+            }
+
+            at += VECTOR_SIZE;
+            let mask = _mm_movemask_epi8(eqc);
+            if mask != 0 {
+                return Some(at + forward_pos(mask));
+            }
+
+            at += VECTOR_SIZE;
+            let mask = _mm_movemask_epi8(eqd);
+            debug_assert!(mask != 0);
+            return Some(at + forward_pos(mask));
+        }
+        ptr = ptr.add(loop_size);
+    }
+    while ptr <= end_ptr.sub(VECTOR_SIZE) {
+        debug_assert!(sub(end_ptr, ptr) >= VECTOR_SIZE);
+
+        if let Some(i) = forward_search1(start_ptr, end_ptr, ptr, vn1) {
+            return Some(i);
+        }
+        ptr = ptr.add(VECTOR_SIZE);
+    }
+    if ptr < end_ptr {
+        debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE);
+        ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr));
+        debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE);
+
+        return forward_search1(start_ptr, end_ptr, ptr, vn1);
+    }
+    None
+}
+
+#[target_feature(enable = "sse2")]
+pub unsafe fn memchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
+    let vn1 = _mm_set1_epi8(n1 as i8);
+    let vn2 = _mm_set1_epi8(n2 as i8);
+    let len = haystack.len();
+    let loop_size = cmp::min(LOOP_SIZE2, len);
+    let start_ptr = haystack.as_ptr();
+    let end_ptr = haystack[haystack.len()..].as_ptr();
+    let mut ptr = start_ptr;
+
+    if haystack.len() < VECTOR_SIZE {
+        while ptr < end_ptr {
+            if *ptr == n1 || *ptr == n2 {
+                return Some(sub(ptr, start_ptr));
+            }
+            ptr = ptr.offset(1);
+        }
+        return None;
+    }
+
+    if let Some(i) = forward_search2(start_ptr, end_ptr, ptr, vn1, vn2) {
+        return Some(i);
+    }
+
+    ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN));
+    debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr);
+    while loop_size == LOOP_SIZE2 && ptr <= end_ptr.sub(loop_size) {
+        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);
+
+        let a = _mm_load_si128(ptr as *const __m128i);
+        let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i);
+        let eqa1 = _mm_cmpeq_epi8(vn1, a);
+        let eqb1 = _mm_cmpeq_epi8(vn1, b);
+        let eqa2 = _mm_cmpeq_epi8(vn2, a);
+        let eqb2 = _mm_cmpeq_epi8(vn2, b);
+        let or1 = _mm_or_si128(eqa1, eqb1);
+        let or2 = _mm_or_si128(eqa2, eqb2);
+        let or3 = _mm_or_si128(or1, or2);
+        if _mm_movemask_epi8(or3) != 0 {
+            let mut at = sub(ptr, start_ptr);
+            let mask1 = _mm_movemask_epi8(eqa1);
+            let mask2 = _mm_movemask_epi8(eqa2);
+            if mask1 != 0 || mask2 != 0 {
+                return Some(at + forward_pos2(mask1, mask2));
+            }
+
+            at += VECTOR_SIZE;
+            let mask1 = _mm_movemask_epi8(eqb1);
+            let mask2 = _mm_movemask_epi8(eqb2);
+            return Some(at + forward_pos2(mask1, mask2));
+        }
+        ptr = ptr.add(loop_size);
+    }
+    while ptr <= end_ptr.sub(VECTOR_SIZE) {
+        if let Some(i) = forward_search2(start_ptr, end_ptr, ptr, vn1, vn2) {
+            return Some(i);
+        }
+        ptr = ptr.add(VECTOR_SIZE);
+    }
+    if ptr < end_ptr {
+        debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE);
+        ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr));
+        debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE);
+
+        return forward_search2(start_ptr, end_ptr, ptr, vn1, vn2);
+    }
+    None
+}
+
+#[target_feature(enable = "sse2")]
+pub unsafe fn memchr3(
+    n1: u8, n2: u8, n3: u8,
+    haystack: &[u8]
+) -> Option<usize> {
+    let vn1 = _mm_set1_epi8(n1 as i8);
+    let vn2 = _mm_set1_epi8(n2 as i8);
+    let vn3 = _mm_set1_epi8(n3 as i8);
+    let len = haystack.len();
+    let loop_size = cmp::min(LOOP_SIZE2, len);
+    let start_ptr = haystack.as_ptr();
+    let end_ptr = haystack[haystack.len()..].as_ptr();
+    let mut ptr = start_ptr;
+
+    if haystack.len() < VECTOR_SIZE {
+        while ptr < end_ptr {
+            if *ptr == n1 || *ptr == n2 || *ptr == n3 {
+                return Some(sub(ptr, start_ptr));
+            }
+            ptr = ptr.offset(1);
+        }
+        return None;
+    }
+
+    if let Some(i) = forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) {
+        return Some(i);
+    }
+
+    ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN));
+    debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr);
+    while loop_size == LOOP_SIZE2 && ptr <= end_ptr.sub(loop_size) {
+        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);
+
+        let a = _mm_load_si128(ptr as *const __m128i);
+        let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i);
+        let eqa1 = _mm_cmpeq_epi8(vn1, a);
+        let eqb1 = _mm_cmpeq_epi8(vn1, b);
+        let eqa2 = _mm_cmpeq_epi8(vn2, a);
+        let eqb2 = _mm_cmpeq_epi8(vn2, b);
+        let eqa3 = _mm_cmpeq_epi8(vn3, a);
+        let eqb3 = _mm_cmpeq_epi8(vn3, b);
+        let or1 = _mm_or_si128(eqa1, eqb1);
+        let or2 = _mm_or_si128(eqa2, eqb2);
+        let or3 = _mm_or_si128(eqa3, eqb3);
+        let or4 = _mm_or_si128(or1, or2);
+        let or5 = _mm_or_si128(or3, or4);
+        if _mm_movemask_epi8(or5) != 0 {
+            let mut at = sub(ptr, start_ptr);
+            let mask1 = _mm_movemask_epi8(eqa1);
+            let mask2 = _mm_movemask_epi8(eqa2);
+            let mask3 = _mm_movemask_epi8(eqa3);
+            if mask1 != 0 || mask2 != 0 || mask3 != 0 {
+                return Some(at + forward_pos3(mask1, mask2, mask3));
+            }
+
+            at += VECTOR_SIZE;
+            let mask1 = _mm_movemask_epi8(eqb1);
+            let mask2 = _mm_movemask_epi8(eqb2);
+            let mask3 = _mm_movemask_epi8(eqb3);
+            // Only two vectors are loaded per iteration, so when the
+            // combined mask is non-zero and the `eqa*` masks are all zero,
+            // the match must be in the `eqb*` masks.
+            debug_assert!(mask1 != 0 || mask2 != 0 || mask3 != 0);
+            return Some(at + forward_pos3(mask1, mask2, mask3));
+        }
+        ptr = ptr.add(loop_size);
+    }
+    while ptr <= end_ptr.sub(VECTOR_SIZE) {
+        if let Some(i) = forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) {
+            return Some(i);
+        }
+        ptr = ptr.add(VECTOR_SIZE);
+    }
+    if ptr < end_ptr {
+        debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE);
+        ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr));
+        debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE);
+
+        return forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3);
+    }
+    None
+}
+
+#[target_feature(enable = "sse2")]
+pub unsafe fn memrchr(n1: u8, haystack: &[u8]) -> Option<usize> {
+    let vn1 = _mm_set1_epi8(n1 as i8);
+    let len = haystack.len();
+    let loop_size = cmp::min(LOOP_SIZE, len);
+    let start_ptr = haystack.as_ptr();
+    let end_ptr = haystack[haystack.len()..].as_ptr();
+    let mut ptr = end_ptr;
+
+    if haystack.len() < VECTOR_SIZE {
+        while ptr > start_ptr {
+            ptr = ptr.offset(-1);
+            if *ptr == n1 {
+                return Some(sub(ptr, start_ptr));
+            }
+        }
+        return None;
+    }
+
+    ptr = ptr.sub(VECTOR_SIZE);
+    if let Some(i) = reverse_search1(start_ptr, end_ptr, ptr, vn1) {
+        return Some(i);
+    }
+
+    ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8;
+    debug_assert!(start_ptr <= ptr && ptr <= end_ptr);
+    while loop_size == LOOP_SIZE && ptr >= start_ptr.add(loop_size) {
+        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);
+
+        ptr = ptr.sub(loop_size);
+        let a = _mm_load_si128(ptr as *const __m128i);
+        let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i);
+        let c = _mm_load_si128(ptr.add(2 * VECTOR_SIZE) as *const __m128i);
+        let d = _mm_load_si128(ptr.add(3 * VECTOR_SIZE) as *const __m128i);
+        let eqa = _mm_cmpeq_epi8(vn1, a);
+        let eqb = _mm_cmpeq_epi8(vn1, b);
+        let eqc = _mm_cmpeq_epi8(vn1, c);
+        let eqd = _mm_cmpeq_epi8(vn1, d);
+        let or1 = _mm_or_si128(eqa, eqb);
+        let or2 = _mm_or_si128(eqc, eqd);
+        let or3 = _mm_or_si128(or1, or2);
+        if _mm_movemask_epi8(or3) != 0 {
+            let mut at = sub(ptr.add(3 * VECTOR_SIZE), start_ptr);
+            let mask = _mm_movemask_epi8(eqd);
+            if mask != 0 {
+                return Some(at + reverse_pos(mask));
+            }
+
+            at -= VECTOR_SIZE;
+            let mask = _mm_movemask_epi8(eqc);
+            if mask != 0 {
+                return Some(at + reverse_pos(mask));
+            }
+
+            at -= VECTOR_SIZE;
+            let mask = _mm_movemask_epi8(eqb);
+            if mask != 0 {
+                return Some(at + reverse_pos(mask));
+            }
+
+            at -= VECTOR_SIZE;
+            let mask = _mm_movemask_epi8(eqa);
+            debug_assert!(mask != 0);
+            return Some(at + reverse_pos(mask));
+        }
+    }
+    while ptr >= start_ptr.add(VECTOR_SIZE) {
+        ptr = ptr.sub(VECTOR_SIZE);
+        if let Some(i) = reverse_search1(start_ptr, end_ptr, ptr, vn1) {
+            return Some(i);
+        }
+    }
+    if ptr > start_ptr {
+        debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE);
+        return reverse_search1(start_ptr, end_ptr, start_ptr, vn1);
+    }
+    None
+}
+
+#[target_feature(enable = "sse2")]
+pub unsafe fn memrchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
+    let vn1 = _mm_set1_epi8(n1 as i8);
+    let vn2 = _mm_set1_epi8(n2 as i8);
+    let len = haystack.len();
+    let loop_size = cmp::min(LOOP_SIZE2, len);
+    let start_ptr = haystack.as_ptr();
+    let end_ptr = haystack[haystack.len()..].as_ptr();
+    let mut ptr = end_ptr;
+
+    if haystack.len() < VECTOR_SIZE {
+        while ptr > start_ptr {
+            ptr = ptr.offset(-1);
+            if *ptr == n1 || *ptr == n2 {
+                return Some(sub(ptr, start_ptr));
+            }
+        }
+        return None;
+    }
+
+    ptr = ptr.sub(VECTOR_SIZE);
+    if let Some(i) = reverse_search2(start_ptr, end_ptr, ptr, vn1, vn2) {
+        return Some(i);
+    }
+
+    ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8;
+    debug_assert!(start_ptr <= ptr && ptr <= end_ptr);
+    while loop_size == LOOP_SIZE2 && ptr >= start_ptr.add(loop_size) {
+        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);
+
+        ptr = ptr.sub(loop_size);
+        let a = _mm_load_si128(ptr as *const __m128i);
+        let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i);
+        let eqa1 = _mm_cmpeq_epi8(vn1, a);
+        let eqb1 = _mm_cmpeq_epi8(vn1, b);
+        let eqa2 = _mm_cmpeq_epi8(vn2, a);
+        let eqb2 = _mm_cmpeq_epi8(vn2, b);
+        let or1 = _mm_or_si128(eqa1, eqb1);
+        let or2 = _mm_or_si128(eqa2, eqb2);
+        let or3 = _mm_or_si128(or1, or2);
+        if _mm_movemask_epi8(or3) != 0 {
+            let mut at = sub(ptr.add(VECTOR_SIZE), start_ptr);
+            let mask1 = _mm_movemask_epi8(eqb1);
+            let mask2 = _mm_movemask_epi8(eqb2);
+            if mask1 != 0 || mask2 != 0 {
+                return Some(at + reverse_pos2(mask1, mask2));
+            }
+
+            at -= VECTOR_SIZE;
+            let mask1 = _mm_movemask_epi8(eqa1);
+            let mask2 = _mm_movemask_epi8(eqa2);
+            return Some(at + reverse_pos2(mask1, mask2));
+        }
+    }
+    while ptr >= start_ptr.add(VECTOR_SIZE) {
+        ptr = ptr.sub(VECTOR_SIZE);
+        if let Some(i) = reverse_search2(start_ptr, end_ptr, ptr, vn1, vn2) {
+            return Some(i);
+        }
+    }
+    if ptr > start_ptr {
+        debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE);
+        return reverse_search2(start_ptr, end_ptr, start_ptr, vn1, vn2);
+    }
+    None
+}
+
+#[target_feature(enable = "sse2")]
+pub unsafe fn memrchr3(
+    n1: u8, n2: u8, n3: u8,
+    haystack: &[u8],
+) -> Option<usize> {
+    let vn1 = _mm_set1_epi8(n1 as i8);
+    let vn2 = _mm_set1_epi8(n2 as i8);
+    let vn3 = _mm_set1_epi8(n3 as i8);
+    let len = haystack.len();
+    let loop_size = cmp::min(LOOP_SIZE2, len);
+    let start_ptr = haystack.as_ptr();
+    let end_ptr = haystack[haystack.len()..].as_ptr();
+    let mut ptr = end_ptr;
+
+    if haystack.len() < VECTOR_SIZE {
+        while ptr > start_ptr {
+            ptr = ptr.offset(-1);
+            if *ptr == n1 || *ptr == n2 || *ptr == n3 {
+                return Some(sub(ptr, start_ptr));
+            }
+        }
+        return None;
+    }
+
+    ptr = ptr.sub(VECTOR_SIZE);
+    if let Some(i) = reverse_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) {
+        return Some(i);
+    }
+
+    ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8;
+    debug_assert!(start_ptr <= ptr && ptr <= end_ptr);
+    while loop_size == LOOP_SIZE2 && ptr >= start_ptr.add(loop_size) {
+        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);
+
+        ptr = ptr.sub(loop_size);
+        let a = _mm_load_si128(ptr as *const __m128i);
+        let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i);
+        let eqa1 = _mm_cmpeq_epi8(vn1, a);
+        let eqb1 = _mm_cmpeq_epi8(vn1, b);
+        let eqa2 = _mm_cmpeq_epi8(vn2, a);
+        let eqb2 = _mm_cmpeq_epi8(vn2, b);
+        let eqa3 = _mm_cmpeq_epi8(vn3, a);
+        let eqb3 = _mm_cmpeq_epi8(vn3, b);
+        let or1 = _mm_or_si128(eqa1, eqb1);
+        let or2 = _mm_or_si128(eqa2, eqb2);
+        let or3 = _mm_or_si128(eqa3, eqb3);
+        let or4 = _mm_or_si128(or1, or2);
+        let or5 = _mm_or_si128(or3, or4);
+        if _mm_movemask_epi8(or5) != 0 {
+            let mut at = sub(ptr.add(VECTOR_SIZE), start_ptr);
+            let mask1 = _mm_movemask_epi8(eqb1);
+            let mask2 = _mm_movemask_epi8(eqb2);
+            let mask3 = _mm_movemask_epi8(eqb3);
+            if mask1 != 0 || mask2 != 0 || mask3 != 0 {
+                return Some(at + reverse_pos3(mask1, mask2, mask3));
+            }
+
+            at -= VECTOR_SIZE;
+            let mask1 = _mm_movemask_epi8(eqa1);
+            let mask2 = _mm_movemask_epi8(eqa2);
+            let mask3 = _mm_movemask_epi8(eqa3);
+            return Some(at + reverse_pos3(mask1, mask2, mask3));
+        }
+    }
+    while ptr >= start_ptr.add(VECTOR_SIZE) {
+        ptr = ptr.sub(VECTOR_SIZE);
+        if let Some(i) = reverse_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) {
+            return Some(i);
+        }
+    }
+    if ptr > start_ptr {
+        debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE);
+        return reverse_search3(start_ptr, end_ptr, start_ptr, vn1, vn2, vn3);
+    }
+    None
+}
+
+#[target_feature(enable = "sse2")]
+pub unsafe fn forward_search1(
+    start_ptr: *const u8,
+    end_ptr: *const u8,
+    ptr: *const u8,
+    vn1: __m128i,
+) -> Option<usize> {
+    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
+    debug_assert!(start_ptr <= ptr);
+    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));
+
+    let chunk = _mm_loadu_si128(ptr as *const __m128i);
+    let mask = _mm_movemask_epi8(_mm_cmpeq_epi8(chunk, vn1));
+    if mask != 0 {
+        Some(sub(ptr, start_ptr) + forward_pos(mask))
+    } else {
+        None
+    }
+}
+
+#[target_feature(enable = "sse2")]
+unsafe fn forward_search2(
+    start_ptr: *const u8,
+    end_ptr: *const u8,
+    ptr: *const u8,
+    vn1: __m128i,
+    vn2: __m128i,
+) -> Option<usize> {
+    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
+    debug_assert!(start_ptr <= ptr);
+    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));
+
+    let chunk = _mm_loadu_si128(ptr as *const __m128i);
+    let eq1 = _mm_cmpeq_epi8(chunk, vn1);
+    let eq2 = _mm_cmpeq_epi8(chunk, vn2);
+    if _mm_movemask_epi8(_mm_or_si128(eq1, eq2)) != 0 {
+        let mask1 = _mm_movemask_epi8(eq1);
+        let mask2 = _mm_movemask_epi8(eq2);
+        Some(sub(ptr, start_ptr) + forward_pos2(mask1, mask2))
+    } else {
+        None
+    }
+}
+
+#[target_feature(enable = "sse2")]
+pub unsafe fn forward_search3(
+    start_ptr: *const u8,
+    end_ptr: *const u8,
+    ptr: *const u8,
+    vn1: __m128i,
+    vn2: __m128i,
+    vn3: __m128i,
+) -> Option<usize> {
+    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
+    debug_assert!(start_ptr <= ptr);
+    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));
+
+    let chunk = _mm_loadu_si128(ptr as *const __m128i);
+    let eq1 = _mm_cmpeq_epi8(chunk, vn1);
+    let eq2 = _mm_cmpeq_epi8(chunk, vn2);
+    let eq3 = _mm_cmpeq_epi8(chunk, vn3);
+    let or = _mm_or_si128(eq1, eq2);
+    if _mm_movemask_epi8(_mm_or_si128(or, eq3)) != 0 {
+        let mask1 = _mm_movemask_epi8(eq1);
+        let mask2 = _mm_movemask_epi8(eq2);
+        let mask3 = _mm_movemask_epi8(eq3);
+        Some(sub(ptr, start_ptr) + forward_pos3(mask1, mask2, mask3))
+    } else {
+        None
+    }
+}
+
+#[target_feature(enable = "sse2")]
+unsafe fn reverse_search1(
+    start_ptr: *const u8,
+    end_ptr: *const u8,
+    ptr: *const u8,
+    vn1: __m128i,
+) -> Option<usize> {
+    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
+    debug_assert!(start_ptr <= ptr);
+    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));
+
+    let chunk = _mm_loadu_si128(ptr as *const __m128i);
+    let mask = _mm_movemask_epi8(_mm_cmpeq_epi8(vn1, chunk));
+    if mask != 0 {
+        Some(sub(ptr, start_ptr) + reverse_pos(mask))
+    } else {
+        None
+    }
+}
+
+#[target_feature(enable = "sse2")]
+unsafe fn reverse_search2(
+    start_ptr: *const u8,
+    end_ptr: *const u8,
+    ptr: *const u8,
+    vn1: __m128i,
+    vn2: __m128i,
+) -> Option<usize> {
+    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
+    debug_assert!(start_ptr <= ptr);
+    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));
+
+    let chunk = _mm_loadu_si128(ptr as *const __m128i);
+    let eq1 = _mm_cmpeq_epi8(chunk, vn1);
+    let eq2 = _mm_cmpeq_epi8(chunk, vn2);
+    if _mm_movemask_epi8(_mm_or_si128(eq1, eq2)) != 0 {
+        let mask1 = _mm_movemask_epi8(eq1);
+        let mask2 = _mm_movemask_epi8(eq2);
+        Some(sub(ptr, start_ptr) + reverse_pos2(mask1, mask2))
+    } else {
+        None
+    }
+}
+
+#[target_feature(enable = "sse2")]
+unsafe fn reverse_search3(
+    start_ptr: *const u8,
+    end_ptr: *const u8,
+    ptr: *const u8,
+    vn1: __m128i,
+    vn2: __m128i,
+    vn3: __m128i,
+) -> Option<usize> {
+    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
+    debug_assert!(start_ptr <= ptr);
+    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));
+
+    let chunk = _mm_loadu_si128(ptr as *const __m128i);
+    let eq1 = _mm_cmpeq_epi8(chunk, vn1);
+    let eq2 = _mm_cmpeq_epi8(chunk, vn2);
+    let eq3 = _mm_cmpeq_epi8(chunk, vn3);
+    let or = _mm_or_si128(eq1, eq2);
+    if _mm_movemask_epi8(_mm_or_si128(or, eq3)) != 0 {
+        let mask1 = _mm_movemask_epi8(eq1);
+        let mask2 = _mm_movemask_epi8(eq2);
+        let mask3 = _mm_movemask_epi8(eq3);
+        Some(sub(ptr, start_ptr) + reverse_pos3(mask1, mask2, mask3))
+    } else {
+        None
+    }
+}
+
+/// Compute the position of the first matching byte from the given mask. The
+/// position returned is always in the range [0, 15].
+///
+/// The mask given is expected to be the result of _mm_movemask_epi8.
+fn forward_pos(mask: i32) -> usize {
+    // We are dealing with little endian here, where the most significant byte
+    // is at a higher address. That means the least significant bit that is set
+    // corresponds to the position of our first matching byte. That position
+    // corresponds to the number of zeros after the least significant bit.
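+    //
+    // Illustrative example (editorial): matches at chunk bytes 3 and 7
+    // produce a mask of 0b1000_1000, and trailing_zeros(0b1000_1000) == 3,
+    // the offset of the first match.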
+    mask.trailing_zeros() as usize
+}
+
+/// Compute the position of the first matching byte from the given masks. The
+/// position returned is always in the range [0, 15]. Each mask corresponds to
+/// the equality comparison of a single byte.
+///
+/// The masks given are expected to be the result of _mm_movemask_epi8, where
+/// at least one of the masks is non-zero (i.e., indicates a match).
+fn forward_pos2(mask1: i32, mask2: i32) -> usize {
+    debug_assert!(mask1 != 0 || mask2 != 0);
+
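+    // Editorial note: if one mask is zero, forward_pos returns
+    // trailing_zeros(0) == 32, which exceeds every valid position in
+    // [0, 15], so the minimum below still comes from the non-zero mask.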
+    let i1 = forward_pos(mask1);
+    let i2 = forward_pos(mask2);
+    if i1 < i2 { i1 } else { i2 }
+}
+
+/// Compute the position of the first matching byte from the given masks. The
+/// position returned is always in the range [0, 15]. Each mask corresponds to
+/// the equality comparison of a single byte.
+///
+/// The masks given are expected to be the result of _mm_movemask_epi8, where
+/// at least one of the masks is non-zero (i.e., indicates a match).
+fn forward_pos3(mask1: i32, mask2: i32, mask3: i32) -> usize {
+    debug_assert!(mask1 != 0 || mask2 != 0 || mask3 != 0);
+
+    let i1 = forward_pos(mask1);
+    let i2 = forward_pos(mask2);
+    let i3 = forward_pos(mask3);
+    if i1 < i2 && i1 < i3 {
+        i1
+    } else if i2 < i3 {
+        i2
+    } else {
+        i3
+    }
+}
+
+/// Compute the position of the last matching byte from the given mask. The
+/// position returned is always in the range [0, 15].
+///
+/// The mask given is expected to be the result of _mm_movemask_epi8.
+fn reverse_pos(mask: i32) -> usize {
+    // We are dealing with little endian here, where the most significant byte
+    // is at a higher address. That means the most significant bit that is set
+    // corresponds to the position of our last matching byte. The position from
+    // the end of the mask is therefore the number of leading zeros in a 16
+    // bit integer, and the position from the start of the mask is therefore
+    // 16 - (leading zeros) - 1.
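+    //
+    // Illustrative example (editorial): matches at chunk bytes 3 and 7 give
+    // a mask of 0b1000_1000; (0b1000_1000u16).leading_zeros() == 8, and
+    // 16 - 8 - 1 == 7, the offset of the last match.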
+    VECTOR_SIZE - (mask as u16).leading_zeros() as usize - 1
+}
+
+/// Compute the position of the last matching byte from the given masks. The
+/// position returned is always in the range [0, 15]. Each mask corresponds to
+/// the equality comparison of a single byte.
+///
+/// The masks given are expected to be the result of _mm_movemask_epi8, where
+/// at least one of the masks is non-zero (i.e., indicates a match).
+fn reverse_pos2(mask1: i32, mask2: i32) -> usize {
+    debug_assert!(mask1 != 0 || mask2 != 0);
+
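+    // Editorial note: the zero cases must be handled explicitly, because
+    // reverse_pos(0) would compute 16 - 16 - 1 and underflow.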
+    if mask1 == 0 {
+        reverse_pos(mask2)
+    } else if mask2 == 0 {
+        reverse_pos(mask1)
+    } else {
+        let i1 = reverse_pos(mask1);
+        let i2 = reverse_pos(mask2);
+        if i1 > i2 { i1 } else { i2 }
+    }
+}
+
+/// Compute the position of the last matching byte from the given masks. The
+/// position returned is always in the range [0, 15]. Each mask corresponds to
+/// the equality comparison of a single byte.
+///
+/// The masks given are expected to be the result of _mm_movemask_epi8, where
+/// at least one of the masks is non-zero (i.e., indicates a match).
+fn reverse_pos3(mask1: i32, mask2: i32, mask3: i32) -> usize {
+    debug_assert!(mask1 != 0 || mask2 != 0 || mask3 != 0);
+
+    if mask1 == 0 {
+        reverse_pos2(mask2, mask3)
+    } else if mask2 == 0 {
+        reverse_pos2(mask1, mask3)
+    } else if mask3 == 0 {
+        reverse_pos2(mask1, mask2)
+    } else {
+        let i1 = reverse_pos(mask1);
+        let i2 = reverse_pos(mask2);
+        let i3 = reverse_pos(mask3);
+        if i1 > i2 && i1 > i3 {
+            i1
+        } else if i2 > i3 {
+            i2
+        } else {
+            i3
+        }
+    }
+}
+
+/// Subtract `b` from `a` and return the difference. `a` should be greater than
+/// or equal to `b`.
+fn sub(a: *const u8, b: *const u8) -> usize {
+    debug_assert!(a >= b);
+    (a as usize) - (b as usize)
+}
new file mode 100644
--- /dev/null
+++ b/third_party/rust/memchr/src/x86/sse42.rs
@@ -0,0 +1,75 @@
+// This code is unused. PCMPESTRI is gratuitously slow. I imagine it might
+// start winning with a hypothetical memchr4 (or greater). This technique might
+// also be good for exposing searches over ranges of bytes, but that departs
+// from the standard memchr API, so it's not clear whether we actually want
+// that or not.
+//
+// N.B. PCMPISTRI appears to be about twice as fast as PCMPESTRI, which is kind
+// of neat. Unfortunately, UTF-8 strings can contain NUL bytes, which means
+// I don't see a way of effectively using PCMPISTRI unless there's some fast
+// way to replace zero bytes with a byte that is not a needle byte.
+
+use core::arch::x86_64::*;
+use core::mem::size_of;
+
+use x86::sse2;
+
+const VECTOR_SIZE: usize = size_of::<__m128i>();
+const CONTROL_ANY: i32 =
+    _SIDD_UBYTE_OPS
+    | _SIDD_CMP_EQUAL_ANY
+    | _SIDD_POSITIVE_POLARITY
+    | _SIDD_LEAST_SIGNIFICANT;
+
+#[target_feature(enable = "sse4.2")]
+pub unsafe fn memchr3(
+    n1: u8, n2: u8, n3: u8,
+    haystack: &[u8]
+) -> Option<usize> {
+    let vn1 = _mm_set1_epi8(n1 as i8);
+    let vn2 = _mm_set1_epi8(n2 as i8);
+    let vn3 = _mm_set1_epi8(n3 as i8);
+    let vn = _mm_setr_epi8(
+        n1 as i8, n2 as i8, n3 as i8, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+    );
+    let len = haystack.len();
+    let start_ptr = haystack.as_ptr();
+    let end_ptr = haystack[haystack.len()..].as_ptr();
+    let mut ptr = start_ptr;
+
+    if haystack.len() < VECTOR_SIZE {
+        while ptr < end_ptr {
+            if *ptr == n1 || *ptr == n2 || *ptr == n3 {
+                return Some(sub(ptr, start_ptr));
+            }
+            ptr = ptr.offset(1);
+        }
+        return None;
+    }
+    while ptr <= end_ptr.sub(VECTOR_SIZE) {
+        let chunk = _mm_loadu_si128(ptr as *const __m128i);
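+        // `_mm_cmpestri` returns the index of the first byte in `chunk` that
+        // equals any of the first 3 bytes of `vn` (the explicit needle
+        // length), or 16 when nothing in the 16-byte chunk matches
+        // (editorial note).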
+        let res = _mm_cmpestri(vn, 3, chunk, 16, CONTROL_ANY);
+        if res < 16 {
+            return Some(sub(ptr, start_ptr) + res as usize);
+        }
+        ptr = ptr.add(VECTOR_SIZE);
+    }
+    if ptr < end_ptr {
+        debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE);
+        ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr));
+        debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE);
+
+        return sse2::forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3);
+    }
+    None
+}
+
+/// Subtract `b` from `a` and return the difference. `a` should be greater than
+/// or equal to `b`.
+fn sub(a: *const u8, b: *const u8) -> usize {
+    debug_assert!(a >= b);
+    (a as usize) - (b as usize)
+}