-
Notifications
You must be signed in to change notification settings - Fork 15
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
http-url fix to support multiple directories in the path
- Loading branch information
0 parents
commit 3de590e
Showing
26 changed files
with
7,339 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
syntax: glob | ||
*.orig | ||
*.rej | ||
*~ | ||
*.c | ||
*.o | ||
*.so | ||
*.aux | ||
*.log | ||
*.lof | ||
*.lot | ||
*.elc | ||
*.err | ||
*.exports | ||
PROFILE.* | ||
*.tar.bz2 | ||
*.tar.gz | ||
*.tgz | ||
.Z-*.scm | ||
*cl-ppcre* | ||
regex-dna* | ||
regexdna* | ||
|
||
syntax: regexp | ||
.*\#.*\#$ | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
|
||
.PHONY: all clean distclean test doc | ||
|
||
all: doc | ||
|
||
# these crazy make hacks will go away with the 0.8 refactoring | ||
|
||
R6RS_DEFINITIONS=find|filter|remove|unicode-.*|char->utf8-list|cset->utf8-pattern | ||
|
||
irregex-r6rs.scm: irregex.scm Makefile | ||
perl -e 'BEGIN{$$/=""}' -ane "s/^(\\(define \\*allow-utf8-mode\\?\\* +#).(.*)/\\1f\\2/sm; s/^(\\(define \([-\\w]*utf8[-\\w]* ([-\\w]+) [^)]*\)).*/\\1 \\2)\n\n/sm; print unless /^\(define +\(?($(R6RS_DEFINITIONS))\\s/" < $< > $@ | ||
|
||
irregex-stalin.scm: irregex.scm regex-dna.scm read-string.scm error.scm | ||
cat $^ | grep -v '^(use ' > $@ | ||
|
||
irregex-jazz.scm: irregex.scm Makefile | ||
perl -e 'BEGIN{$$/=""}' -e 'END{print "\n)\n"}' -ape "s/^(\\(define \\*allow-utf8-mode\\?\\* +#).(.*)/\\1f\\2/sm; s/^(\\(define irregex-tag .*)/(unit irregex.implementation.irregex\n\n(declare (proper-tail-calls) (block) (fixnum) (inline) (inlining-limit 700) (standard-bindings) (extended-bindings))\n\n\\1/sm; s/^\\(define \\(?integer-log .*/(define (integer-log n) (if (zero? n) n (- (integer-length n) 1)))\n\n/sm; s/^\\(define \\(bit-.*//sm; s/\\bbit-(not|ior|and)\\b/bitwise-\\1/gsm; s/\\bbit-shl\\b/fxarithmetic-shift-left/gsm; s/\\bbit-shr\\b/fxarithmetic-shift-right/gsm; if (/^\\(define .*multi-state/) {s/24/16/gsm; s/\\b(vector)\\b/u16\\1/gsm; s/\\(u16vector-ref mst 0\\)/(equal?-hash mst)/gsm; }" < $< > $@ | ||
|
||
irregex-base.html: irregex.doc irregex.mistie irregex.css | ||
csi -R mistie -e '(mistie-load "plain.mistie")' \ | ||
-e '(mistie-load "scmhilit.mistie")' \ | ||
-e '(define h-page-count 0)' \ | ||
-e '(mistie-load "xref.mistie")' \ | ||
-e '(mistie-load "timestamp.mistie")' \ | ||
-e '(mistie-load "irregex.mistie")' \ | ||
-e '(mistie-main "$<")' > $@ | ||
|
||
irregex.html: irregex-base.html | ||
grep '^<a name="SECTION_' $< |\ | ||
perl -ape 's{<a name="(SECTION_(\d+)(\.\d+)?)"><h[12]>(?:[.\d]+)(?:\s| )*([^<>]*).*}{($$3?($$3==.1?($$sub=1,"<ol>\n"):""):(($$x=$$sub,$$sub=0,$$x>0)?"</ol>\n":""))."<li><a href=\"#$$1\">$$4</a>"}ge;' > irregex-toc.html | ||
perl -ape 's{^<!--\s*TOC\s*-->}{"<ol>\n".`cat irregex-toc.html`."</ol>"}e' $< > $@ | ||
rm -f $< irregex-toc.html | ||
|
||
doc: irregex.html | ||
|
||
test: | ||
csi -script test-all.scm | ||
|
||
clean: | ||
rm -f *~ */*~ *.so | ||
|
||
distclean: clean | ||
rm -f *.html | ||
|
||
dist: doc | ||
rm -f irregex-`cat VERSION`.tar.gz | ||
mkdir irregex-`cat VERSION` | ||
for f in `hg manifest`; do mkdir -p irregex-`cat VERSION`/`dirname $$f`; ln -s `pwd`/$$f irregex-`cat VERSION`/$$f; done | ||
cd irregex-`cat VERSION`; for f in `echo ../*.html`; do ln -s $$f; done; cd .. | ||
tar cphzvf irregex-`cat VERSION`.tar.gz irregex-`cat VERSION` | ||
rm -rf irregex-`cat VERSION` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
|
||
irregex | ||
------- | ||
|
||
Portable Efficient IrRegular Expressions | ||
|
||
http://synthcode.com/scheme/irregex/ | ||
|
||
A fully portable and efficient R[4567]RS implementation of regular | ||
expressions, supporting both POSIX syntax with various (irregular) | ||
PCRE extensions, as well as SCSH's SRE syntax. DFA matching is used | ||
when possible, otherwise a closure-compiled NFA approach is used. | ||
|
||
Documentation is in the file irregex.html. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
0.9.0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
# These try to test a wide variety of regular expressions which will | ||
# give very different performance when run on DFA matches, | ||
# non-backtracking NFA matchers, and backtracking matchers. The | ||
# Boyer-Moore tests also heuristic matchers, such as CL-PPCRE. | ||
# | ||
# Not checked are multiple backreference patterns (which are NP-hard | ||
# and thus presumably require exponential time). | ||
# | ||
# We still need benchmarks for various special patterns such as | ||
# bos/eos to test how clever IrRegex is about compiling DFAs. | ||
# | ||
# Memory usage is not yet measured. | ||
# | ||
# Column format: | ||
# | ||
# name pattern string prefix compile-repitions search-repetitions | ||
|
||
char literal a a xxxxxxxxxx 1000 10000 | ||
string literal abccb abccb xxxxxxxxxx 1000 10000 | ||
ci string literal (?i:abccb) aBCcB xxxxxxxxxx 1000 10000 | ||
best-case boyer-moore abcdefghijklmnopq abcdefghijklmnopq xxxxxxxxxx 1000 10000 | ||
worst-case boyer-moore abbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb abbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb bbbbbbbbbb 1000 10000 | ||
alternation (?:asm|break|case|catch|const_cast|continue|default|delete|do|dynamic_cast|else|explicit|export|false|for|friend|goto|if|mutable|namespace|new|operator|private|protected|public|register|reinterpret_cast|return|sizeof|static_cast|switch|template|this|throw|true|try|typedef|typeid|typename|using|virtual|while) virtual aeiouaeiou 1 10000 | ||
basic charsets [[:digit:]][[:alnum:]] 2R abc 1000 10000 | ||
sparse charsets [aeiouAEIOU][aeiouAEIOU][aeiouAEIOU] oEU xxxxx 1000 10000 | ||
|
||
# Some more realistic examples | ||
nonanchored RFC3986 URI regex (([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))? http://www.call-with-current-continuation.org/index.html?abc=def#ghi ### 100 1000 | ||
simplified email [[:alnum:]][[:alnum:]?.+_-]*@[[:alnum:]_-]+(\.[[:alnum:]_-]+)*\.[[:alpha:]]+ [email protected] @@@@ 100 10000 | ||
# From http://blogs.sun.com/paulj/entry/email_regex_address_validation_aka | ||
email (")?([[:alnum:]!#$%&'*+/=?^_`{|}~-])+(\.[[:alnum:]!#$%&'*+/=?^_`{|}~-]+)*(")?@[[:alnum:]-]+(\.[[:alnum:]-])*\.? [email protected] @@@@ 100 10000 | ||
|
||
# Expensive examples which may hang or explode | ||
backtracker a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa x 100 100 | ||
expontential dfa a[ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab] abbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb b 1 100 | ||
#backtracker + expontential dfa a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa[ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab] aaaaaaaaaaaaaaaaaaaaaaaaaaaaaabbbbbbbbbb b 1 100 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
#! /usr/bin/env perl | ||
|
||
use strict; | ||
|
||
sub bench ($$$) { | ||
my ($name, $sub, $n) = @_; | ||
my $start = times; | ||
for (my $i=0; $i<$n; $i++) { $sub->(); } | ||
print "$name: ".((times-$start)*1000)."\n"; | ||
} | ||
|
||
open(IN, "< re-benchmarks.txt"); | ||
while (<IN>) { | ||
next if /^\s*(?:#.*)?$/; | ||
my ($name, $pat, $str, $prefix, $compn, $execn) = split(/\t/); | ||
$pat =~ s{/}{\\/}g; | ||
bench("$name: compile-time", sub {eval "/$pat/"}, $compn); | ||
my ($rx, $rxm, $str2); | ||
eval "\$rx = qr/$pat/"; | ||
eval "\$rxm = qr/^$pat\$/"; | ||
bench("$name: match-time", sub {$str =~ $rxm}, $execn); | ||
for (my $mult=1; $execn>=10; $mult*=10, $execn/=10) { | ||
$str2 = (($prefix x $mult).$str); | ||
bench("$name: search prefix x $mult", sub {$str2 =~ $rx}, $execn); | ||
} | ||
} | ||
close(IN); | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
|
||
(use chicken extras regex data-structures srfi-13) | ||
(include "../irregex") | ||
|
||
(define-syntax time-expr | ||
(syntax-rules () | ||
((time-expr expr) | ||
(let ((start (nth-value 0 (cpu-time)))) | ||
expr | ||
(- (nth-value 0 (cpu-time)) start))))) | ||
|
||
(define (string-replicate str reps) | ||
(let lp ((ls '()) (reps reps)) | ||
(if (<= reps 0) | ||
(string-concatenate-reverse ls) | ||
(lp (cons str ls) (- reps 1))))) | ||
|
||
(define (run-bench name pat str prefix comp-count exec-count) | ||
(let-syntax | ||
((bench (syntax-rules () | ||
((bench variation expr count) | ||
(let ((time-taken | ||
(time-expr (do ((i count (- i 1))) | ||
((< i 0)) | ||
expr)))) | ||
(display name) (display ": ") | ||
(display variation) (display ": ") | ||
(write time-taken) (newline)))))) | ||
(let ((comp-count (string->number comp-count)) | ||
(exec-count (string->number exec-count))) | ||
;; compile time | ||
(bench "compile-time" (string->irregex pat) comp-count) | ||
(let ((irx (string->irregex pat))) | ||
;; match time | ||
(bench "match-time" (irregex-match irx str) exec-count) | ||
;; search times | ||
(let lp ((mult 1) (reps exec-count)) | ||
(cond | ||
((>= reps 10) | ||
(let ((str (string-append (string-replicate prefix mult) str))) | ||
(bench (string-append "search prefix x " (number->string mult)) | ||
(irregex-search irx str) | ||
reps) | ||
(lp (* mult 10) (quotient reps 10)))))))))) | ||
|
||
(call-with-input-file "re-benchmarks.txt" | ||
(lambda (in) | ||
(let lp () | ||
(let ((line (read-line in))) | ||
(cond | ||
((eof-object? line)) | ||
((string-match "^\\s*(?:#.*)?$" line) | ||
(lp)) | ||
(else | ||
(let ((ls (string-split line "\t"))) | ||
(apply run-bench ls) | ||
(lp)))))))) | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
|
||
(define (error msg . args) | ||
(display msg) | ||
(for-each (lambda (x) (display " ") (write x)) args) | ||
(newline) | ||
(0)) | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
|
||
(module irregex | ||
(irregex string->irregex sre->irregex | ||
string->sre maybe-string->sre | ||
irregex? irregex-match-data? | ||
irregex-new-matches irregex-reset-matches! | ||
irregex-search irregex-search/matches irregex-match | ||
irregex-search/chunked irregex-match/chunked make-irregex-chunker | ||
irregex-match-substring irregex-match-subchunk | ||
irregex-match-start-chunk irregex-match-start-index | ||
irregex-match-end-chunk irregex-match-end-index | ||
irregex-match-num-submatches irregex-match-names | ||
irregex-match-valid-index? | ||
irregex-fold irregex-replace irregex-replace/all | ||
irregex-dfa irregex-dfa/search irregex-dfa/extract | ||
irregex-nfa irregex-flags irregex-lengths irregex-names | ||
irregex-num-submatches irregex-extract irregex-split | ||
) | ||
(import scheme) | ||
(import chicken) | ||
(include "irregex.scm") | ||
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
;;; Install irregex-guile.scm to $GUILE_SITE_DIR/rx/irregex.scm | ||
;;; and irregex.scm to $GUILE_SITE_DIR/rx/source/irregex.scm | ||
|
||
(define-module (rx irregex) | ||
#:export (irregex string->irregex sre->irregex string->sre | ||
maybe-string->sre irregex? irregex-match-data? | ||
irregex-new-matches irregex-reset-matches! irregex-search | ||
irregex-search/matches irregex-match | ||
irregex-search/chunked irregex-match/chunked | ||
make-irregex-chunker irregex-match-substring | ||
irregex-match-subchunk irregex-match-start-source | ||
irregex-match-start-index irregex-match-end-source | ||
irregex-match-end-index irregex-match-num-submatches | ||
irregex-match-names irregex-match-valid-index? | ||
irregex-fold irregex-replace irregex-replace/all | ||
irregex-dfa irregex-dfa/search irregex-dfa/extract | ||
irregex-nfa irregex-flags irregex-lengths irregex-names | ||
irregex-num-submatches irregex-extract irregex-split)) | ||
|
||
(load-from-path "rx/source/irregex.scm") |
Oops, something went wrong.