third_party/rust/regex/src/re_trait.rs
author Dorel Luca <dluca@mozilla.com>
Sat, 12 Jan 2019 03:43:46 +0200
changeset 453620 def9811f0311
parent 453619 3c4b8e03e722
child 460816 c5e856d5edba
permissions -rw-r--r--
Backed out 2 changesets (bug 1516337) for build bustage. CLOSED TREE Backed out changeset 3c4b8e03e722 (bug 1516337) Backed out changeset 4fc377013db5 (bug 1516337)

// Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

/// Slot is a single saved capture location. Note that there are two slots for
/// every capture in a regular expression (one slot each for the start and end
/// of the capture).
pub type Slot = Option<usize>;

/// Locations represents the offsets of each capturing group in a regex for
/// a single match.
///
/// Unlike `Captures`, a `Locations` value only stores offsets.
#[doc(hidden)]
pub struct Locations(Vec<Slot>);

impl Locations {
    /// Returns the start and end positions of the Nth capture group. Returns
    /// `None` if `i` is not a valid capture group or if the capture group did
    /// not match anything. The positions returned are *always* byte indices
    /// with respect to the original string matched.
    pub fn pos(&self, i: usize) -> Option<(usize, usize)> {
        let (s, e) = (i * 2, i * 2 + 1);
        match (self.0.get(s), self.0.get(e)) {
            (Some(&Some(s)), Some(&Some(e))) => Some((s, e)),
            _ => None,
        }
    }

    /// Creates an iterator of all the capture group positions in order of
    /// appearance in the regular expression. Positions are byte indices
    /// in terms of the original string matched.
    pub fn iter(&self) -> SubCapturesPosIter {
        SubCapturesPosIter { idx: 0, locs: self }
    }

    /// Returns the total number of capturing groups.
    ///
    /// This is always at least `1` since every regex has at least `1`
    /// capturing group that corresponds to the entire match.
    pub fn len(&self) -> usize {
        self.0.len() / 2
    }
}

/// This is a hack to make Locations -> &mut [Slot] be available internally
/// without exposing it in the public API.
pub fn as_slots(locs: &mut Locations) -> &mut [Slot] {
    &mut locs.0
}

/// An iterator over capture group positions for a particular match of a
/// regular expression.
///
/// Positions are byte indices in terms of the original string matched.
///
/// `'c` is the lifetime of the captures.
pub struct SubCapturesPosIter<'c> {
    idx: usize,
    locs: &'c Locations,
}

impl<'c> Iterator for SubCapturesPosIter<'c> {
    type Item = Option<(usize, usize)>;

    fn next(&mut self) -> Option<Option<(usize, usize)>> {
        if self.idx >= self.locs.len() {
            return None;
        }
        let x = match self.locs.pos(self.idx) {
            None => Some(None),
            Some((s, e)) => {
                Some(Some((s, e)))
            }
        };
        self.idx += 1;
        x
    }
}

/// `RegularExpression` describes types that can implement regex searching.
///
/// This trait is my attempt at reducing code duplication and to standardize
/// the internal API. Specific duplication that is avoided are the `find`
/// and `capture` iterators, which are slightly tricky.
///
/// It's not clear whether this trait is worth it, and it also isn't
/// clear whether it's useful as a public trait or not. Methods like
/// `next_after_empty` reak of bad design, but the rest of the methods seem
/// somewhat reasonable. One particular thing this trait would expose would be
/// the ability to start the search of a regex anywhere in a haystack, which
/// isn't possible in the current public API.
pub trait RegularExpression: Sized {
    /// The type of the haystack.
    type Text: ?Sized;

    /// The number of capture slots in the compiled regular expression. This is
    /// always two times the number of capture groups (two slots per group).
    fn slots_len(&self) -> usize;

    /// Allocates fresh space for all capturing groups in this regex.
    fn locations(&self) -> Locations {
        Locations(vec![None; self.slots_len()])
    }

    /// Returns the position of the next character after `i`.
    ///
    /// For example, a haystack with type `&[u8]` probably returns `i+1`,
    /// whereas a haystack with type `&str` probably returns `i` plus the
    /// length of the next UTF-8 sequence.
    fn next_after_empty(&self, text: &Self::Text, i: usize) -> usize;

    /// Returns the location of the shortest match.
    fn shortest_match_at(
        &self,
        text: &Self::Text,
        start: usize,
    ) -> Option<usize>;

    /// Returns whether the regex matches the text given.
    fn is_match_at(
        &self,
        text: &Self::Text,
        start: usize,
    ) -> bool;

    /// Returns the leftmost-first match location if one exists.
    fn find_at(
        &self,
        text: &Self::Text,
        start: usize,
    ) -> Option<(usize, usize)>;

    /// Returns the leftmost-first match location if one exists, and also
    /// fills in any matching capture slot locations.
    fn read_captures_at(
        &self,
        locs: &mut Locations,
        text: &Self::Text,
        start: usize,
    ) -> Option<(usize, usize)>;

    /// Returns an iterator over all non-overlapping successive leftmost-first
    /// matches.
    fn find_iter (
        self,
        text: &Self::Text,
    ) -> Matches<Self> {
        Matches {
            re: self,
            text: text,
            last_end: 0,
            last_match: None,
        }
    }

    /// Returns an iterator over all non-overlapping successive leftmost-first
    /// matches with captures.
    fn captures_iter(
        self,
        text: &Self::Text,
    ) -> CaptureMatches<Self> {
        CaptureMatches(self.find_iter(text))
    }
}

/// An iterator over all non-overlapping successive leftmost-first matches.
pub struct Matches<'t, R> where R: RegularExpression, R::Text: 't {
    re: R,
    text: &'t R::Text,
    last_end: usize,
    last_match: Option<usize>,
}

impl<'t, R> Matches<'t, R> where R: RegularExpression, R::Text: 't {
    /// Return the text being searched.
    pub fn text(&self) -> &'t R::Text {
        self.text
    }

    /// Return the underlying regex.
    pub fn regex(&self) -> &R {
        &self.re
    }
}

impl<'t, R> Iterator for Matches<'t, R>
        where R: RegularExpression, R::Text: 't + AsRef<[u8]> {
    type Item = (usize, usize);

    fn next(&mut self) -> Option<(usize, usize)> {
        if self.last_end > self.text.as_ref().len() {
            return None;
        }
        let (s, e) = match self.re.find_at(self.text, self.last_end) {
            None => return None,
            Some((s, e)) => (s, e),
        };
        if s == e {
            // This is an empty match. To ensure we make progress, start
            // the next search at the smallest possible starting position
            // of the next match following this one.
            self.last_end = self.re.next_after_empty(self.text, e);
            // Don't accept empty matches immediately following a match.
            // Just move on to the next match.
            if Some(e) == self.last_match {
                return self.next();
            }
        } else {
            self.last_end = e;
        }
        self.last_match = Some(e);
        Some((s, e))
    }
}

/// An iterator over all non-overlapping successive leftmost-first matches with
/// captures.
pub struct CaptureMatches<'t, R>(Matches<'t, R>)
    where R: RegularExpression, R::Text: 't;

impl<'t, R> CaptureMatches<'t, R> where R: RegularExpression, R::Text: 't {
    /// Return the text being searched.
    pub fn text(&self) -> &'t R::Text {
        self.0.text()
    }

    /// Return the underlying regex.
    pub fn regex(&self) -> &R {
        self.0.regex()
    }
}

impl<'t, R> Iterator for CaptureMatches<'t, R>
        where R: RegularExpression, R::Text: 't + AsRef<[u8]> {
    type Item = Locations;

    fn next(&mut self) -> Option<Locations> {
        if self.0.last_end > self.0.text.as_ref().len() {
            return None
        }
        let mut locs = self.0.re.locations();
        let (s, e) = match self.0.re.read_captures_at(
            &mut locs,
            self.0.text,
            self.0.last_end,
        ) {
            None => return None,
            Some((s, e)) => (s, e),
        };
        if s == e {
            self.0.last_end = self.0.re.next_after_empty(self.0.text, e);
            if Some(e) == self.0.last_match {
                return self.next();
            }
        } else {
            self.0.last_end = e;
        }
        self.0.last_match = Some(e);
        Some(locs)
    }
}