Skip to content

Commit

Permalink
impl: initial import of regex-automata
Browse files Browse the repository at this point in the history
This effectively copies my regex-automata work into this crate and does
a bunch of rejiggering to make it work. In particular, we wire up its
new test harness to the public regex crate API. In this commit, that
means the regex crate API is being simultaneously tested using both the
old and new test suites.

This does *not* get rid of the old regex crate implementation. That will
happen in a subsequent commit. This is just a staging commit to prepare
for that.
  • Loading branch information
BurntSushi committed Jul 5, 2023
1 parent 1d9ce15 commit c79c40a
Show file tree
Hide file tree
Showing 216 changed files with 84,807 additions and 220 deletions.
27 changes: 18 additions & 9 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -149,27 +149,36 @@ jobs:
if: matrix.build == 'stable'
run: |
# 'stable' is Linux only, so we have bash.
cd regex-syntax
./test
./regex-syntax/test
- name: Build regex-automata docs
if: matrix.build != 'pinned'
run: |
${{ env.CARGO }} doc --verbose --manifest-path regex-automata/Cargo.toml $TARGET
- name: Run subset of regex-automata tests
if: matrix.build != 'pinned' && matrix.build != 'stable'
run: |
${{ env.CARGO }} test --verbose --manifest-path regex-automata/Cargo.toml $TARGET
- name: Run full regex-automata test suite
if: matrix.build == 'stable'
run: |
# 'stable' is Linux only, so we have bash.
./regex-automata/test
- name: Run regex-capi tests
if: matrix.build == 'stable'
run: |
# 'stable' is Linux only, so we have bash.
cd regex-capi
./test
./regex-capi/test
- if: matrix.build == 'nightly'
name: Run benchmarks as tests
run: |
cd bench
./run rust --no-run --verbose
- if: matrix.build == 'nightly'
name: Run tests with pattern feature
run: |
cargo test --test default --no-default-features --features 'std pattern unicode-perl'
rustfmt:
name: rustfmt
runs-on: ubuntu-latest
Expand Down
153 changes: 134 additions & 19 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[package]
name = "regex"
version = "1.8.4" #:version
authors = ["The Rust Project Developers"]
authors = ["The Rust Project Developers", "Andrew Gallant <jamslam@gmail.com>"]
license = "MIT OR Apache-2.0"
readme = "README.md"
repository = "https://github.com/rust-lang/regex"
Expand All @@ -19,7 +19,12 @@ rust-version = "1.60.0"

[workspace]
members = [
"bench", "regex-capi", "regex-syntax",
"bench",
"regex-automata",
"regex-capi",
"regex-cli",
"regex-syntax",
"regex-test",
]

[lib]
Expand All @@ -42,27 +47,53 @@ default = ["std", "perf", "unicode", "regex-syntax/default"]
# to compile without std, and instead just rely on 'core' and 'alloc' (for
# example). Currently, this isn't supported, and removing the 'std' feature
# will prevent regex from compiling.
std = []
std = [
"aho-corasick?/std",
"memchr?/std",
"regex-automata/std",
"regex-syntax/std",
]
# The 'use_std' feature is DEPRECATED. It will be removed in regex 2. Until
# then, it is an alias for the 'std' feature.
use_std = ["std"]


# PERFORMANCE FEATURES

# Enables all performance features.
perf = ["perf-cache", "perf-dfa", "perf-inline", "perf-literal"]
# Enables all default performance features. Note that this specifically does
# not include perf-dfa-full, because it leads to higher compile times and
# bigger binaries, and the runtime performance improvement is not obviously
# worth it.
perf = [
"perf-cache",
"perf-dfa",
"perf-onepass",
"perf-backtrack",
"perf-inline",
"perf-literal",
]
# Enables fast caching. (If disabled, caching is still used, but is slower.)
# Currently, this feature has no effect. It used to remove the thread_local
# dependency and use a slower internal cache, but now the default cache has
# been improved and thread_local is no longer a dependency at all.
perf-cache = []
# Enables use of a lazy DFA when possible.
perf-dfa = []
perf-dfa = ["regex-automata/hybrid"]
# Enables use of a fully compiled DFA when possible.
perf-dfa-full = ["regex-automata/dfa-build", "regex-automata/dfa-search"]
# Enables use of the one-pass regex matcher, which speeds up capture searches
# even beyond the backtracker.
perf-onepass = ["regex-automata/dfa-onepass"]
# Enables use of a bounded backtracker, which speeds up capture searches.
perf-backtrack = ["regex-automata/nfa-backtrack"]
# Enables aggressive use of inlining.
perf-inline = []
perf-inline = ["regex-automata/perf-inline"]
# Enables literal optimizations.
perf-literal = ["aho-corasick", "memchr"]
perf-literal = [
"dep:aho-corasick",
"dep:memchr",
"regex-automata/perf-literal",
]


# UNICODE DATA FEATURES
Expand All @@ -76,22 +107,45 @@ unicode = [
"unicode-perl",
"unicode-script",
"unicode-segment",
"regex-automata/unicode",
"regex-syntax/unicode",
]
# Enables use of the `Age` property, e.g., `\p{Age:3.0}`.
unicode-age = ["regex-syntax/unicode-age"]
unicode-age = [
"regex-automata/unicode-age",
"regex-syntax/unicode-age",
]
# Enables use of a smattering of boolean properties, e.g., `\p{Emoji}`.
unicode-bool = ["regex-syntax/unicode-bool"]
unicode-bool = [
"regex-automata/unicode-bool",
"regex-syntax/unicode-bool",
]
# Enables Unicode-aware case insensitive matching, e.g., `(?i)β`.
unicode-case = ["regex-syntax/unicode-case"]
unicode-case = [
"regex-automata/unicode-case",
"regex-syntax/unicode-case",
]
# Enables Unicode general categories, e.g., `\p{Letter}` or `\pL`.
unicode-gencat = ["regex-syntax/unicode-gencat"]
unicode-gencat = [
"regex-automata/unicode-gencat",
"regex-syntax/unicode-gencat",
]
# Enables Unicode-aware Perl classes corresponding to `\w`, `\s` and `\d`.
unicode-perl = ["regex-syntax/unicode-perl"]
unicode-perl = [
"regex-automata/unicode-perl",
"regex-automata/unicode-word-boundary",
"regex-syntax/unicode-perl",
]
# Enables Unicode scripts and script extensions, e.g., `\p{Greek}`.
unicode-script = ["regex-syntax/unicode-script"]
unicode-script = [
"regex-automata/unicode-script",
"regex-syntax/unicode-script",
]
# Enables Unicode segmentation properties, e.g., `\p{gcb=Extend}`.
unicode-segment = ["regex-syntax/unicode-segment"]
unicode-segment = [
"regex-automata/unicode-segment",
"regex-syntax/unicode-segment",
]


# UNSTABLE FEATURES (requires Rust nightly)
Expand Down Expand Up @@ -121,6 +175,13 @@ path = "regex-syntax"
version = "0.7.2"
default-features = false

# For the actual regex engines.
[dependencies.regex-automata]
path = "regex-automata"
version = "0.3.0"
default-features = false
features = ["alloc", "syntax", "meta", "nfa-pikevm"]

[dev-dependencies]
# For examples.
lazy_static = "1"
Expand All @@ -129,10 +190,39 @@ quickcheck = { version = "1.0.3", default-features = false }
# For generating random test data.
rand = { version = "0.8.3", default-features = false, features = ["getrandom", "small_rng"] }
# To check README's example
# TODO: Re-enable this once the MSRV is 1.43 or greater.
# See: https://github.com/rust-lang/regex/issues/684
# See: https://github.com/rust-lang/regex/issues/685
# doc-comment = "0.3"
doc-comment = "0.3"
# For easy error handling in integration tests.
anyhow = "1.0.69"
# A library for testing regex engines.
regex-test = { path = "regex-test", version = "0.1.0" }

[dev-dependencies.env_logger]
# Note that this is currently using an older version because of the dependency
# tree explosion that happened in 0.10.
version = "0.9.3"
default-features = false
features = ["atty", "humantime", "termcolor"]

# This test suite reads a whole boatload of tests from the top-level testdata
# directory, and then runs them against the regex crate API.
#
# regex-automata has its own version of them, and runs them against each
# internal regex engine individually.
#
# This means that if you're seeing a failure in this test suite, you should
# try running regex-automata's tests:
#
# cargo test --manifest-path regex-automata/Cargo.toml --test integration
#
# That *might* give you a more targeted test failure. i.e., "only the
# PikeVM fails this test." Which gives you a narrower place to search. If
# regex-automata's test suite passes, then the bug might be in the integration
# of the regex crate and regex-automata. But generally speaking, a failure
# in this test suite *should* mean there is a corresponding failure in
# regex-automata's test suite.
[[test]]
path = "newtests/tests.rs"
name = "integration"

# Run the test suite on the default behavior of Regex::new.
# This includes a mish mash of NFAs and DFAs, which are chosen automatically
Expand Down Expand Up @@ -185,11 +275,36 @@ name = "backtrack-bytes"
path = "tests/test_crates_regex.rs"
name = "crates-regex"

[package.metadata.docs.rs]
# We want to document all features.
all-features = true
# Since this crate's feature setup is pretty complicated, it is worth opting
# into a nightly unstable option to show the features that need to be enabled
# for public API items. To do that, we set 'docsrs', and when that's enabled,
# we enable the 'doc_auto_cfg' feature.
#
# To test this locally, run:
#
# RUSTDOCFLAGS="--cfg docsrs" cargo +nightly doc --all-features
rustdoc-args = ["--cfg", "docsrs"]

[profile.release]
debug = true

[profile.bench]
debug = true

[profile.dev]
# Running tests takes too long in debug mode, so we forcefully always build
# with optimizations. Unfortunate, but, ¯\_(ツ)_/¯.
#
# It's counter-intuitive that this needs to be set on dev *and* test, but
# it's because the tests that take a long time to run are run as integration
# tests in a separate crate. The test.opt-level setting won't apply there, so
# we need to set the opt-level across the entire build.
opt-level = 3
debug = true

[profile.test]
opt-level = 3
debug = true
101 changes: 101 additions & 0 deletions newtests/bytes.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
use {
anyhow::Result,
regex::bytes::{Regex, RegexBuilder},
regex_test::{
CompiledRegex, Match, RegexTest, Span, TestResult, TestRunner,
},
};

/// Runs the test suite returned by `crate::suite()` against
/// `regex::bytes::Regex`.
///
/// The runner is asked to expand tests for which `test.compiles()` holds
/// using the additional names `is_match`, `find` and `captures` (the names
/// dispatched on in `run_test`). Entries of `super::BLACKLIST` are handed to
/// `blacklist_iter`, and every test is compiled through `compiler`.
///
/// NOTE(review): the trailing `assert()` presumably panics when any test
/// failed -- confirm against `regex_test::TestRunner`.
#[test]
fn default() -> Result<()> {
let mut runner = TestRunner::new()?;
runner
.expand(&["is_match", "find", "captures"], |test| test.compiles())
.blacklist_iter(super::BLACKLIST)
.test_iter(crate::suite()?.iter(), compiler)
.assert();
Ok(())
}

/// Runs one test against an already compiled `bytes::Regex`.
///
/// The API exercised is selected by `test.additional_name()`:
///
/// * `is_match` calls `Regex::is_match` on the test's haystack.
/// * `find` reports the spans yielded by `Regex::find_iter`.
/// * `captures` reports the groups yielded by `Regex::captures_iter`.
///
/// Both iterators are truncated to `test.match_limit()` items when a limit is
/// present. Any other name produces a failing `TestResult` that mentions the
/// unrecognized name.
fn run_test(re: &Regex, test: &RegexTest) -> TestResult {
    match test.additional_name() {
        "is_match" => TestResult::matched(re.is_match(test.haystack())),
        "find" => {
            let found = re.find_iter(test.haystack());
            // No limit means "report every match". The associated constant
            // `usize::MAX` is used in place of the legacy `std::usize::MAX`
            // module constant.
            let limit = test.match_limit().unwrap_or(usize::MAX);
            TestResult::matches(found.take(limit).map(|m| Match {
                // `compiler` only accepts single-pattern tests, so every
                // match is reported with ID 0.
                id: 0,
                span: Span { start: m.start(), end: m.end() },
            }))
        }
        "captures" => {
            let found = re.captures_iter(test.haystack());
            let limit = test.match_limit().unwrap_or(usize::MAX);
            TestResult::captures(
                found.take(limit).map(|caps| testify_captures(&caps)),
            )
        }
        name => TestResult::fail(&format!("unrecognized test name: {}", name)),
    }
}

/// Builds the search closure for a single test, backed by a `bytes::Regex`.
///
/// A test whose configuration cannot be expressed through the APIs exercised
/// by `run_test` is not treated as a failure: a `CompiledRegex` that skips
/// the test is returned for it instead. An error is returned only when the
/// pattern itself fails to build.
fn compiler(
    test: &RegexTest,
    _patterns: &[String],
) -> anyhow::Result<CompiledRegex> {
    let skip = Ok(CompiledRegex::skip());

    // bytes::Regex is a single-pattern API, so a test with any other number
    // of patterns is out of scope here.
    if test.regexes().len() != 1 {
        return skip;
    }
    let pattern = &test.regexes()[0];
    // is_match, find_iter and captures_iter are the only routines under
    // test, and every one of them performs a leftmost search.
    match test.search_kind() {
        regex_test::SearchKind::Leftmost => {}
        _ => return skip,
    }
    // Leftmost-first is the only match semantics offered by the top-level
    // single-pattern API.
    match test.match_kind() {
        regex_test::MatchKind::LeftmostFirst => {}
        _ => return skip,
    }
    // Searches through the top-level API are always unanchored. An anchored
    // test is still usable, but only when it is limited to a single match.
    if test.anchored() && !matches!(test.match_limit(), Some(1)) {
        return skip;
    }
    // Explicit search bounds aren't handled. (The 'find_at' family of APIs
    // could probably be used to support them.)
    let span = test.bounds();
    if span.start != 0 || span.end != test.haystack().len() {
        return skip;
    }
    // There is currently no way to turn on UTF-8 mode for bytes::Regex. In
    // other words, it is allowed to report match offsets that fall inside a
    // codepoint, so tests requiring UTF-8 mode can't be run.
    if test.utf8() {
        return skip;
    }

    let mut builder = RegexBuilder::new(pattern);
    builder.case_insensitive(test.case_insensitive());
    builder.unicode(test.unicode());
    let re = builder.build()?;
    Ok(CompiledRegex::compiled(move |t| run_test(&re, t)))
}

/// Translates a `regex::bytes::Captures` into the capture representation
/// used by the test suite.
///
/// Every group becomes an optional span: `Some` with the group's start and
/// end offsets when it participated in the match, `None` otherwise.
fn testify_captures(
    caps: &regex::bytes::Captures<'_>,
) -> regex_test::Captures {
    let groups = caps.iter().map(|group| {
        // Groups that did not participate in the match stay `None`.
        let m = group?;
        Some(regex_test::Span { start: m.start(), end: m.end() })
    });
    // Unwrapping is fine here: 'caps' is assumed to come from a successful
    // match, which always has at least one group, and its first group is
    // never None.
    regex_test::Captures::new(0, groups).unwrap()
}
Loading

0 comments on commit c79c40a

Please sign in to comment.