impl: initial import of regex-automata

This effectively copies my regex-automata work into this crate and does a bunch of rejiggering to make it work. In particular, we wire up its new test harness to the public regex crate API. In this commit, that means the regex crate API is being simultaneously tested using both the old and new test suites. This does *not* get rid of the old regex crate implementation. That will happen in a subsequent commit. This is just a staging commit to prepare for that.
runblaze · Jul 5, 2023 · c79c40a · c79c40a
1 parent 1d9ce15
commit c79c40a
Show file tree

Hide file tree

Showing 216 changed files with 84,807 additions and 220 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -149,27 +149,36 @@ jobs:
       if: matrix.build == 'stable'
       run: |
         # 'stable' is Linux only, so we have bash.
-        cd regex-syntax
-        ./test
+        ./regex-syntax/test
+
+    - name: Build regex-automata docs
+      if: matrix.build != 'pinned'
+      run: |
+        ${{ env.CARGO }} doc --verbose --manifest-path regex-automata/Cargo.toml $TARGET
+
+    - name: Run subset of regex-automata tests
+      if: matrix.build != 'pinned' && matrix.build != 'stable'
+      run: |
+        ${{ env.CARGO }} test --verbose --manifest-path regex-automata/Cargo.toml $TARGET
+
+    - name: Run full regex-automata test suite
+      if: matrix.build == 'stable'
+      run: |
+        # 'stable' is Linux only, so we have bash.
+        ./regex-automata/test
 
     - name: Run regex-capi tests
       if: matrix.build == 'stable'
       run: |
         # 'stable' is Linux only, so we have bash.
-        cd regex-capi
-        ./test
+        ./regex-capi/test
 
     - if: matrix.build == 'nightly'
       name: Run benchmarks as tests
       run: |
         cd bench
         ./run rust --no-run --verbose
 
-    - if: matrix.build == 'nightly'
-      name: Run tests with pattern feature
-      run: |
-        cargo test --test default --no-default-features --features 'std pattern unicode-perl'
-
   rustfmt:
     name: rustfmt
     runs-on: ubuntu-latest

diff --git a/Cargo.toml b/Cargo.toml
@@ -1,7 +1,7 @@
 [package]
 name = "regex"
 version = "1.8.4"  #:version
-authors = ["The Rust Project Developers"]
+authors = ["The Rust Project Developers", "Andrew Gallant <[email protected]>"]
 license = "MIT OR Apache-2.0"
 readme = "README.md"
 repository = "https://github.com/rust-lang/regex"
@@ -19,7 +19,12 @@ rust-version = "1.60.0"
 
 [workspace]
 members = [
-  "bench", "regex-capi", "regex-syntax",
+  "bench",
+  "regex-automata",
+  "regex-capi",
+  "regex-cli",
+  "regex-syntax",
+  "regex-test",
 ]
 
 [lib]
@@ -42,27 +47,53 @@ default = ["std", "perf", "unicode", "regex-syntax/default"]
 # to compile without std, and instead just rely on 'core' and 'alloc' (for
 # example). Currently, this isn't supported, and removing the 'std' feature
 # will prevent regex from compiling.
-std = []
+std = [
+  "aho-corasick?/std",
+  "memchr?/std",
+  "regex-automata/std",
+  "regex-syntax/std",
+]
 # The 'use_std' feature is DEPRECATED. It will be removed in regex 2. Until
 # then, it is an alias for the 'std' feature.
 use_std = ["std"]
 
 
 # PERFORMANCE FEATURES
 
-# Enables all performance features.
-perf = ["perf-cache", "perf-dfa", "perf-inline", "perf-literal"]
+# Enables all default performance features. Note that this specifically does
+# not include perf-dfa-full, because it leads to higher compile times and
+# bigger binaries, and the runtime performance improvement is not obviously
+# worth it.
+perf = [
+  "perf-cache",
+  "perf-dfa",
+  "perf-onepass",
+  "perf-backtrack",
+  "perf-inline",
+  "perf-literal",
+]
 # Enables fast caching. (If disabled, caching is still used, but is slower.)
 # Currently, this feature has no effect. It used to remove the thread_local
 # dependency and use a slower internal cache, but now the default cache has
 # been improved and thread_local is no longer a dependency at all.
 perf-cache = []
 # Enables use of a lazy DFA when possible.
-perf-dfa = []
+perf-dfa = ["regex-automata/hybrid"]
+# Enables use of a fully compiled DFA when possible.
+perf-dfa-full = ["regex-automata/dfa-build", "regex-automata/dfa-search"]
+# Enables use of the one-pass regex matcher, which speeds up capture searches
+# even beyond the backtracker.
+perf-onepass = ["regex-automata/dfa-onepass"]
+# Enables use of a bounded backtracker, which speeds up capture searches.
+perf-backtrack = ["regex-automata/nfa-backtrack"]
 # Enables aggressive use of inlining.
-perf-inline = []
+perf-inline = ["regex-automata/perf-inline"]
 # Enables literal optimizations.
-perf-literal = ["aho-corasick", "memchr"]
+perf-literal = [
+  "dep:aho-corasick",
+  "dep:memchr",
+  "regex-automata/perf-literal",
+]
 
 
 # UNICODE DATA FEATURES
@@ -76,22 +107,45 @@ unicode = [
   "unicode-perl",
   "unicode-script",
   "unicode-segment",
+  "regex-automata/unicode",
   "regex-syntax/unicode",
 ]
 # Enables use of the `Age` property, e.g., `\p{Age:3.0}`.
-unicode-age = ["regex-syntax/unicode-age"]
+unicode-age = [
+  "regex-automata/unicode-age",
+  "regex-syntax/unicode-age",
+]
 # Enables use of a smattering of boolean properties, e.g., `\p{Emoji}`.
-unicode-bool = ["regex-syntax/unicode-bool"]
+unicode-bool = [
+  "regex-automata/unicode-bool",
+  "regex-syntax/unicode-bool",
+]
 # Enables Unicode-aware case insensitive matching, e.g., `(?i)β`.
-unicode-case = ["regex-syntax/unicode-case"]
+unicode-case = [
+  "regex-automata/unicode-case",
+  "regex-syntax/unicode-case",
+]
 # Enables Unicode general categories, e.g., `\p{Letter}` or `\pL`.
-unicode-gencat = ["regex-syntax/unicode-gencat"]
+unicode-gencat = [
+  "regex-automata/unicode-gencat",
+  "regex-syntax/unicode-gencat",
+]
 # Enables Unicode-aware Perl classes corresponding to `\w`, `\s` and `\d`.
-unicode-perl = ["regex-syntax/unicode-perl"]
+unicode-perl = [
+  "regex-automata/unicode-perl",
+  "regex-automata/unicode-word-boundary",
+  "regex-syntax/unicode-perl",
+]
 # Enables Unicode scripts and script extensions, e.g., `\p{Greek}`.
-unicode-script = ["regex-syntax/unicode-script"]
+unicode-script = [
+  "regex-automata/unicode-script",
+  "regex-syntax/unicode-script",
+]
 # Enables Unicode segmentation properties, e.g., `\p{gcb=Extend}`.
-unicode-segment = ["regex-syntax/unicode-segment"]
+unicode-segment = [
+  "regex-automata/unicode-segment",
+  "regex-syntax/unicode-segment",
+]
 
 
 # UNSTABLE FEATURES (requires Rust nightly)
@@ -121,6 +175,13 @@ path = "regex-syntax"
 version = "0.7.2"
 default-features = false
 
+# For the actual regex engines.
+[dependencies.regex-automata]
+path = "regex-automata"
+version = "0.3.0"
+default-features = false
+features = ["alloc", "syntax", "meta", "nfa-pikevm"]
+
 [dev-dependencies]
 # For examples.
 lazy_static = "1"
@@ -129,10 +190,39 @@ quickcheck = { version = "1.0.3", default-features = false }
 # For generating random test data.
 rand = { version = "0.8.3", default-features = false, features = ["getrandom", "small_rng"] }
 # To check README's example
-# TODO: Re-enable this once the MSRV is 1.43 or greater.
-# See: https://github.com/rust-lang/regex/issues/684
-# See: https://github.com/rust-lang/regex/issues/685
-# doc-comment = "0.3"
+doc-comment = "0.3"
+# For easy error handling in integration tests.
+anyhow = "1.0.69"
+# A library for testing regex engines.
+regex-test = { path = "regex-test", version = "0.1.0" }
+
+[dev-dependencies.env_logger]
+# Note that this is currently using an older version because of the dependency
+# tree explosion that happened in 0.10.
+version = "0.9.3"
+default-features = false
+features = ["atty", "humantime", "termcolor"]
+
+# This test suite reads a whole boatload of tests from the top-level testdata
+# directory, and then runs them against the regex crate API.
+#
+# regex-automata has its own version of them, and runs them against each
+# internal regex engine individually.
+#
+# This means that if you're seeing a failure in this test suite, you should
+# try running regex-automata's tests:
+#
+#     cargo test --manifest-path regex-automata/Cargo.toml --test integration
+#
+# That *might* give you a more targeted test failure. i.e., "only the
+# PikeVM fails this test." Which gives you a narrower place to search. If
+# regex-automata's test suite passes, then the bug might be in the integration
+# of the regex crate and regex-automata. But generally speaking, a failure
+# in this test suite *should* mean there is a corresponding failure in
+# regex-automata's test suite.
+[[test]]
+path = "newtests/tests.rs"
+name = "integration"
 
 # Run the test suite on the default behavior of Regex::new.
 # This includes a mish mash of NFAs and DFAs, which are chosen automatically
@@ -185,11 +275,36 @@ name = "backtrack-bytes"
 path = "tests/test_crates_regex.rs"
 name = "crates-regex"
 
+[package.metadata.docs.rs]
+# We want to document all features.
+all-features = true
+# Since this crate's feature setup is pretty complicated, it is worth opting
+# into a nightly unstable option to show the features that need to be enabled
+# for public API items. To do that, we set 'docsrs', and when that's enabled,
+# we enable the 'doc_auto_cfg' feature.
+#
+# To test this locally, run:
+#
+#     RUSTDOCFLAGS="--cfg docsrs" cargo +nightly doc --all-features
+rustdoc-args = ["--cfg", "docsrs"]
+
 [profile.release]
 debug = true
 
 [profile.bench]
 debug = true
 
+[profile.dev]
+# Running tests takes too long in debug mode, so we forcefully always build
+# with optimizations. Unfortunate, but, ¯\_(ツ)_/¯.
+#
+# It's counter-intuitive that this needs to be set on dev *and* test, but
+# it's because the tests that take a long time to run are run as integration
+# tests in a separate crate. The test.opt-level setting won't apply there, so
+# we need to set the opt-level across the entire build.
+opt-level = 3
+debug = true
+
 [profile.test]
+opt-level = 3
 debug = true
diff --git a/newtests/bytes.rs b/newtests/bytes.rs
@@ -0,0 +1,101 @@
+use {
+    anyhow::Result,
+    regex::bytes::{Regex, RegexBuilder},
+    regex_test::{
+        CompiledRegex, Match, RegexTest, Span, TestResult, TestRunner,
+    },
+};
+
+/// Runs the shared test suite against `regex::bytes::Regex` as built by
+/// `compiler`, i.e. with only the options each test itself requests.
+///
+/// Tests for which `compiles()` holds are handed to the runner's `expand`
+/// step under the names `is_match`, `find` and `captures` (the names that
+/// `run_test` dispatches on), and the parent module's `BLACKLIST` is passed
+/// to the runner before the suite is executed.
+#[test]
+fn default() -> Result<()> {
+    // The additional names recognised by `run_test`.
+    const EXPANSIONS: &[&str] = &["is_match", "find", "captures"];
+
+    let mut harness = TestRunner::new()?;
+    harness
+        .expand(EXPANSIONS, |t| t.compiles())
+        .blacklist_iter(super::BLACKLIST)
+        .test_iter(crate::suite()?.iter(), compiler)
+        .assert();
+    Ok(())
+}
+
+/// Runs one test against an already compiled `bytes::Regex`.
+///
+/// The search performed is selected by the test's additional name:
+///
+/// * `is_match` reports whether the haystack matches at all.
+/// * `find` reports the span of each match, up to the test's match limit.
+/// * `captures` reports the capture groups of each match, up to the test's
+///   match limit.
+///
+/// Any other name produces a failing `TestResult`.
+fn run_test(re: &Regex, test: &RegexTest) -> TestResult {
+    let haystack = test.haystack();
+    // Without an explicit limit, every match is visited.
+    let limit = test.match_limit().unwrap_or(usize::MAX);
+    match test.additional_name() {
+        "is_match" => TestResult::matched(re.is_match(haystack)),
+        "find" => {
+            let found = re.find_iter(haystack).take(limit).map(|m| Match {
+                id: 0,
+                span: Span { start: m.start(), end: m.end() },
+            });
+            TestResult::matches(found)
+        }
+        "captures" => {
+            let groups = re
+                .captures_iter(haystack)
+                .take(limit)
+                .map(|caps| testify_captures(&caps));
+            TestResult::captures(groups)
+        }
+        other => {
+            let msg = format!("unrecognized test name: {}", other);
+            TestResult::fail(&msg)
+        }
+    }
+}
+
+/// Builds a `bytes::Regex` from the given test and returns a `CompiledRegex`
+/// whose closure runs that regex through `run_test`.
+///
+/// When the test asks for a configuration that this harness does not run
+/// (see the conditions spelled out in the body), a `CompiledRegex` that
+/// skips the test is returned instead. An error from building the regex is
+/// propagated to the caller.
+fn compiler(
+    test: &RegexTest,
+    _patterns: &[String],
+) -> anyhow::Result<CompiledRegex> {
+    use regex_test::{MatchKind, SearchKind};
+
+    let skipped = Ok(CompiledRegex::skip());
+
+    // Every condition must hold for the test to be runnable here. They are
+    // checked in order and evaluation stops at the first one that fails.
+    let runnable =
+        // Only bytes::Regex is tested here, and it takes exactly one
+        // pattern.
+        test.regexes().len() == 1
+        // is_match, find_iter and captures_iter are all leftmost searches.
+        && matches!(test.search_kind(), SearchKind::Leftmost)
+        // The top-level single-pattern API always uses leftmost-first.
+        && matches!(test.match_kind(), MatchKind::LeftmostFirst)
+        // The top-level API always runs unanchored searches, but an
+        // anchored test that is limited to a single match can still be
+        // handled.
+        && (!test.anchored() || test.match_limit() == Some(1))
+        // Explicit search bounds are not supported, so the bounds must span
+        // the entire haystack. (The 'find_at' family of APIs could probably
+        // be used to lift this restriction.)
+        && {
+            let bounds = test.bounds();
+            bounds.start == 0 && bounds.end == test.haystack().len()
+        }
+        // The bytes::Regex API has no way to enable UTF-8 mode; it permits
+        // matches whose offsets split a codepoint.
+        && !test.utf8();
+    if !runnable {
+        return skipped;
+    }
+
+    let pattern = &test.regexes()[0];
+    let mut builder = RegexBuilder::new(pattern);
+    builder
+        .case_insensitive(test.case_insensitive())
+        .unicode(test.unicode());
+    let re = builder.build()?;
+    Ok(CompiledRegex::compiled(move |t| run_test(&re, t)))
+}
+
+/// Translates a `bytes::Captures` into the capture representation used by
+/// the test suite.
+///
+/// Each group becomes `Some(Span)` when it participated in the match and
+/// `None` otherwise.
+fn testify_captures(
+    caps: &regex::bytes::Captures<'_>,
+) -> regex_test::Captures {
+    /// Copies the offsets of a single group match into a test-suite span.
+    fn span_of(m: regex::bytes::Match<'_>) -> Span {
+        Span { start: m.start(), end: m.end() }
+    }
+
+    let groups = caps.iter().map(|group| group.map(span_of));
+    // Unwrapping is fine: 'caps' is assumed to come from a successful match,
+    // which always has at least one group, the first of which is never
+    // None.
+    regex_test::Captures::new(0, groups).unwrap()
+}