forked from rust-lang/regex
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
impl: initial import of regex-automata
This effectively copies my regex-automata work into this crate and does a bunch of rejiggering to make it work. In particular, we wire up its new test harness to the public regex crate API. In this commit, that means the regex crate API is being simultaneously tested using both the old and new test suites. This does *not* get rid of the old regex crate implementation. That will happen in a subsequent commit. This is just a staging commit to prepare for that.
- Loading branch information
1 parent
1d9ce15
commit c79c40a
Showing
216 changed files
with
84,807 additions
and
220 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,7 @@ | ||
[package] | ||
name = "regex" | ||
version = "1.8.4" #:version | ||
authors = ["The Rust Project Developers"] | ||
authors = ["The Rust Project Developers", "Andrew Gallant <[email protected]>"] | ||
license = "MIT OR Apache-2.0" | ||
readme = "README.md" | ||
repository = "https://github.com/rust-lang/regex" | ||
|
@@ -19,7 +19,12 @@ rust-version = "1.60.0" | |
|
||
[workspace] | ||
members = [ | ||
"bench", "regex-capi", "regex-syntax", | ||
"bench", | ||
"regex-automata", | ||
"regex-capi", | ||
"regex-cli", | ||
"regex-syntax", | ||
"regex-test", | ||
] | ||
|
||
[lib] | ||
|
@@ -42,27 +47,53 @@ default = ["std", "perf", "unicode", "regex-syntax/default"] | |
# to compile without std, and instead just rely on 'core' and 'alloc' (for | ||
# example). Currently, this isn't supported, and removing the 'std' feature | ||
# will prevent regex from compiling. | ||
std = [] | ||
std = [ | ||
"aho-corasick?/std", | ||
"memchr?/std", | ||
"regex-automata/std", | ||
"regex-syntax/std", | ||
] | ||
# The 'use_std' feature is DEPRECATED. It will be removed in regex 2. Until | ||
# then, it is an alias for the 'std' feature. | ||
use_std = ["std"] | ||
|
||
|
||
# PERFORMANCE FEATURES | ||
|
||
# Enables all performance features. | ||
perf = ["perf-cache", "perf-dfa", "perf-inline", "perf-literal"] | ||
# Enables all default performance features. Note that this specifically does | ||
# not include perf-dfa-full, because it leads to higher compile times and | ||
# bigger binaries, and the runtime performance improvement is not obviously | ||
# worth it. | ||
perf = [ | ||
"perf-cache", | ||
"perf-dfa", | ||
"perf-onepass", | ||
"perf-backtrack", | ||
"perf-inline", | ||
"perf-literal", | ||
] | ||
# Enables fast caching. (If disabled, caching is still used, but is slower.) | ||
# Currently, this feature has no effect. It used to remove the thread_local | ||
# dependency and use a slower internal cache, but now the default cache has | ||
# been improved and thread_local is no longer a dependency at all. | ||
perf-cache = [] | ||
# Enables use of a lazy DFA when possible. | ||
perf-dfa = [] | ||
perf-dfa = ["regex-automata/hybrid"] | ||
# Enables use of a fully compiled DFA when possible. | ||
perf-dfa-full = ["regex-automata/dfa-build", "regex-automata/dfa-search"] | ||
# Enables use of the one-pass regex matcher, which speeds up capture searches | ||
# even beyond the backtracker. | ||
perf-onepass = ["regex-automata/dfa-onepass"] | ||
# Enables use of a bounded backtracker, which speeds up capture searches. | ||
perf-backtrack = ["regex-automata/nfa-backtrack"] | ||
# Enables aggressive use of inlining. | ||
perf-inline = [] | ||
perf-inline = ["regex-automata/perf-inline"] | ||
# Enables literal optimizations. | ||
perf-literal = ["aho-corasick", "memchr"] | ||
perf-literal = [ | ||
"dep:aho-corasick", | ||
"dep:memchr", | ||
"regex-automata/perf-literal", | ||
] | ||
|
||
|
||
# UNICODE DATA FEATURES | ||
|
@@ -76,22 +107,45 @@ unicode = [ | |
"unicode-perl", | ||
"unicode-script", | ||
"unicode-segment", | ||
"regex-automata/unicode", | ||
"regex-syntax/unicode", | ||
] | ||
# Enables use of the `Age` property, e.g., `\p{Age:3.0}`. | ||
unicode-age = ["regex-syntax/unicode-age"] | ||
unicode-age = [ | ||
"regex-automata/unicode-age", | ||
"regex-syntax/unicode-age", | ||
] | ||
# Enables use of a smattering of boolean properties, e.g., `\p{Emoji}`. | ||
unicode-bool = ["regex-syntax/unicode-bool"] | ||
unicode-bool = [ | ||
"regex-automata/unicode-bool", | ||
"regex-syntax/unicode-bool", | ||
] | ||
# Enables Unicode-aware case insensitive matching, e.g., `(?i)β`. | ||
unicode-case = ["regex-syntax/unicode-case"] | ||
unicode-case = [ | ||
"regex-automata/unicode-case", | ||
"regex-syntax/unicode-case", | ||
] | ||
# Enables Unicode general categories, e.g., `\p{Letter}` or `\pL`. | ||
unicode-gencat = ["regex-syntax/unicode-gencat"] | ||
unicode-gencat = [ | ||
"regex-automata/unicode-gencat", | ||
"regex-syntax/unicode-gencat", | ||
] | ||
# Enables Unicode-aware Perl classes corresponding to `\w`, `\s` and `\d`. | ||
unicode-perl = ["regex-syntax/unicode-perl"] | ||
unicode-perl = [ | ||
"regex-automata/unicode-perl", | ||
"regex-automata/unicode-word-boundary", | ||
"regex-syntax/unicode-perl", | ||
] | ||
# Enables Unicode scripts and script extensions, e.g., `\p{Greek}`. | ||
unicode-script = ["regex-syntax/unicode-script"] | ||
unicode-script = [ | ||
"regex-automata/unicode-script", | ||
"regex-syntax/unicode-script", | ||
] | ||
# Enables Unicode segmentation properties, e.g., `\p{gcb=Extend}`. | ||
unicode-segment = ["regex-syntax/unicode-segment"] | ||
unicode-segment = [ | ||
"regex-automata/unicode-segment", | ||
"regex-syntax/unicode-segment", | ||
] | ||
|
||
|
||
# UNSTABLE FEATURES (requires Rust nightly) | ||
|
@@ -121,6 +175,13 @@ path = "regex-syntax" | |
version = "0.7.2" | ||
default-features = false | ||
|
||
# For the actual regex engines. | ||
[dependencies.regex-automata] | ||
path = "regex-automata" | ||
version = "0.3.0" | ||
default-features = false | ||
features = ["alloc", "syntax", "meta", "nfa-pikevm"] | ||
|
||
[dev-dependencies] | ||
# For examples. | ||
lazy_static = "1" | ||
|
@@ -129,10 +190,39 @@ quickcheck = { version = "1.0.3", default-features = false } | |
# For generating random test data. | ||
rand = { version = "0.8.3", default-features = false, features = ["getrandom", "small_rng"] } | ||
# To check README's example | ||
# TODO: Re-enable this once the MSRV is 1.43 or greater. | ||
# See: https://github.com/rust-lang/regex/issues/684 | ||
# See: https://github.com/rust-lang/regex/issues/685 | ||
# doc-comment = "0.3" | ||
doc-comment = "0.3" | ||
# For easy error handling in integration tests. | ||
anyhow = "1.0.69" | ||
# A library for testing regex engines. | ||
regex-test = { path = "regex-test", version = "0.1.0" } | ||
|
||
[dev-dependencies.env_logger] | ||
# Note that this is currently using an older version because of the dependency | ||
# tree explosion that happened in 0.10. | ||
version = "0.9.3" | ||
default-features = false | ||
features = ["atty", "humantime", "termcolor"] | ||
|
||
# This test suite reads a whole boatload of tests from the top-level testdata | ||
# directory, and then runs them against the regex crate API. | ||
# | ||
# regex-automata has its own version of them, and runs them against each | ||
# internal regex engine individually. | ||
# | ||
# This means that if you're seeing a failure in this test suite, you should | ||
# try running regex-automata's tests: | ||
# | ||
# cargo test --manifest-path regex-automata/Cargo.toml --test integration | ||
# | ||
# That *might* give you a more targeted test failure. i.e., "only the | ||
# PikeVM fails this test." Which gives you a narrower place to search. If | ||
# regex-automata's test suite passes, then the bug might be in the integration | ||
# of the regex crate and regex-automata. But generally speaking, a failure | ||
# in this test suite *should* mean there is a corresponding failure in | ||
# regex-automata's test suite. | ||
[[test]] | ||
path = "newtests/tests.rs" | ||
name = "integration" | ||
|
||
# Run the test suite on the default behavior of Regex::new. | ||
# This includes a mish mash of NFAs and DFAs, which are chosen automatically | ||
|
@@ -185,11 +275,36 @@ name = "backtrack-bytes" | |
path = "tests/test_crates_regex.rs" | ||
name = "crates-regex" | ||
|
||
[package.metadata.docs.rs] | ||
# We want to document all features. | ||
all-features = true | ||
# Since this crate's feature setup is pretty complicated, it is worth opting | ||
# into a nightly unstable option to show the features that need to be enabled | ||
# for public API items. To do that, we set 'docsrs', and when that's enabled, | ||
# we enable the 'doc_auto_cfg' feature. | ||
# | ||
# To test this locally, run: | ||
# | ||
# RUSTDOCFLAGS="--cfg docsrs" cargo +nightly doc --all-features | ||
rustdoc-args = ["--cfg", "docsrs"] | ||
|
||
[profile.release] | ||
debug = true | ||
|
||
[profile.bench] | ||
debug = true | ||
|
||
[profile.dev] | ||
# Running tests takes too long in debug mode, so we forcefully always build | ||
# with optimizations. Unfortunate, but, ¯\_(ツ)_/¯. | ||
# | ||
# It's counter-intuitive that this needs to be set on dev *and* test, but | ||
# it's because the tests that take a long time to run are run as integration | ||
# tests in a separate crate. The test.opt-level setting won't apply there, so | ||
# we need to set the opt-level across the entire build. | ||
opt-level = 3 | ||
debug = true | ||
|
||
[profile.test] | ||
opt-level = 3 | ||
debug = true |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,101 @@ | ||
use { | ||
anyhow::Result, | ||
regex::bytes::{Regex, RegexBuilder}, | ||
regex_test::{ | ||
CompiledRegex, Match, RegexTest, Span, TestResult, TestRunner, | ||
}, | ||
}; | ||
|
||
/// Runs the shared test suite against `regex::bytes::Regex` built with its
/// default configuration (see `compiler`).
///
/// Every test that is expected to compile is expanded into the `is_match`,
/// `find` and `captures` variants that `run_test` dispatches on. Tests named
/// in the parent module's `BLACKLIST` are excluded.
#[test]
fn default() -> Result<()> {
    let mut runner = TestRunner::new()?;
    runner.expand(&["is_match", "find", "captures"], |t| t.compiles());
    runner.blacklist_iter(super::BLACKLIST);
    // The suite is loaded only after the runner has been configured, and is
    // kept alive for the duration of this statement.
    runner.test_iter(crate::suite()?.iter(), compiler);
    runner.assert();
    Ok(())
}
|
||
fn run_test(re: &Regex, test: &RegexTest) -> TestResult { | ||
match test.additional_name() { | ||
"is_match" => TestResult::matched(re.is_match(test.haystack())), | ||
"find" => TestResult::matches( | ||
re.find_iter(test.haystack()) | ||
.take(test.match_limit().unwrap_or(std::usize::MAX)) | ||
.map(|m| Match { | ||
id: 0, | ||
span: Span { start: m.start(), end: m.end() }, | ||
}), | ||
), | ||
"captures" => { | ||
let it = re | ||
.captures_iter(test.haystack()) | ||
.take(test.match_limit().unwrap_or(std::usize::MAX)) | ||
.map(|caps| testify_captures(&caps)); | ||
TestResult::captures(it) | ||
} | ||
name => TestResult::fail(&format!("unrecognized test name: {}", name)), | ||
} | ||
} | ||
|
||
/// Converts the given regex test to a closure that searches with a | ||
/// `bytes::Regex`. If the test configuration is unsupported, then a | ||
/// `CompiledRegex` that skips the test is returned. | ||
fn compiler( | ||
test: &RegexTest, | ||
_patterns: &[String], | ||
) -> anyhow::Result<CompiledRegex> { | ||
let skip = Ok(CompiledRegex::skip()); | ||
|
||
// We're only testing bytes::Regex here, which supports one pattern only. | ||
let pattern = match test.regexes().len() { | ||
1 => &test.regexes()[0], | ||
_ => return skip, | ||
}; | ||
// We only test is_match, find_iter and captures_iter. All of those are | ||
// leftmost searches. | ||
if !matches!(test.search_kind(), regex_test::SearchKind::Leftmost) { | ||
return skip; | ||
} | ||
// The top-level single-pattern regex API always uses leftmost-first. | ||
if !matches!(test.match_kind(), regex_test::MatchKind::LeftmostFirst) { | ||
return skip; | ||
} | ||
// The top-level regex API always runs unanchored searches. ... But we can | ||
// handle tests that are anchored but have only one match. | ||
if test.anchored() && test.match_limit() != Some(1) { | ||
return skip; | ||
} | ||
// We don't support tests with explicit search bounds. We could probably | ||
// support this by using the 'find_at' (and such) APIs. | ||
let bounds = test.bounds(); | ||
if !(bounds.start == 0 && bounds.end == test.haystack().len()) { | ||
return skip; | ||
} | ||
// The bytes::Regex API specifically does not support enabling UTF-8 mode. | ||
// It could I suppose, but currently it does not. That is, it permits | ||
// matches to have offsets that split codepoints. | ||
if test.utf8() { | ||
return skip; | ||
} | ||
let re = RegexBuilder::new(pattern) | ||
.case_insensitive(test.case_insensitive()) | ||
.unicode(test.unicode()) | ||
.build()?; | ||
Ok(CompiledRegex::compiled(move |test| run_test(&re, test))) | ||
} | ||
|
||
/// Convert `Captures` into the test suite's capture values. | ||
fn testify_captures( | ||
caps: ®ex::bytes::Captures<'_>, | ||
) -> regex_test::Captures { | ||
let spans = caps.iter().map(|group| { | ||
group.map(|m| regex_test::Span { start: m.start(), end: m.end() }) | ||
}); | ||
// This unwrap is OK because we assume our 'caps' represents a match, and | ||
// a match always gives a non-zero number of groups with the first group | ||
// being non-None. | ||
regex_test::Captures::new(0, spans).unwrap() | ||
} |
Oops, something went wrong.