Skip to content

Commit

Permalink
http-url fix to support multiple directories in the path
Browse files Browse the repository at this point in the history
  • Loading branch information
ashinn committed Aug 15, 2012
0 parents commit 3de590e
Show file tree
Hide file tree
Showing 26 changed files with 7,339 additions and 0 deletions.
27 changes: 27 additions & 0 deletions .hgignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
syntax: glob
*.orig
*.rej
*~
*.c
*.o
*.so
*.aux
*.log
*.lof
*.lot
*.pdf
*.elc
*.err
*.exports
PROFILE.*
*.tar.bz2
*.tar.gz
*.tgz
.Z-*.scm
*cl-ppcre*
regex-dna*
regexdna*

syntax: regexp
.*\#.*\#$

51 changes: 51 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@

.PHONY: all clean distclean test doc

all: doc

# these crazy make hacks will go away with the 0.8 refactoring

R6RS_DEFINITIONS=find|filter|remove|unicode-.*|char->utf8-list|cset->utf8-pattern

irregex-r6rs.scm: irregex.scm Makefile
perl -e 'BEGIN{$$/=""}' -ane "s/^(\\(define \\*allow-utf8-mode\\?\\* +#).(.*)/\\1f\\2/sm; s/^(\\(define \([-\\w]*utf8[-\\w]* ([-\\w]+) [^)]*\)).*/\\1 \\2)\n\n/sm; print unless /^\(define +\(?($(R6RS_DEFINITIONS))\\s/" < $< > $@

irregex-stalin.scm: irregex.scm regex-dna.scm read-string.scm error.scm
cat $^ | grep -v '^(use ' > $@

irregex-jazz.scm: irregex.scm Makefile
perl -e 'BEGIN{$$/=""}' -e 'END{print "\n)\n"}' -ape "s/^(\\(define \\*allow-utf8-mode\\?\\* +#).(.*)/\\1f\\2/sm; s/^(\\(define irregex-tag .*)/(unit irregex.implementation.irregex\n\n(declare (proper-tail-calls) (block) (fixnum) (inline) (inlining-limit 700) (standard-bindings) (extended-bindings))\n\n\\1/sm; s/^\\(define \\(?integer-log .*/(define (integer-log n) (if (zero? n) n (- (integer-length n) 1)))\n\n/sm; s/^\\(define \\(bit-.*//sm; s/\\bbit-(not|ior|and)\\b/bitwise-\\1/gsm; s/\\bbit-shl\\b/fxarithmetic-shift-left/gsm; s/\\bbit-shr\\b/fxarithmetic-shift-right/gsm; if (/^\\(define .*multi-state/) {s/24/16/gsm; s/\\b(vector)\\b/u16\\1/gsm; s/\\(u16vector-ref mst 0\\)/(equal?-hash mst)/gsm; }" < $< > $@

irregex-base.html: irregex.doc irregex.mistie irregex.css
csi -R mistie -e '(mistie-load "plain.mistie")' \
-e '(mistie-load "scmhilit.mistie")' \
-e '(define h-page-count 0)' \
-e '(mistie-load "xref.mistie")' \
-e '(mistie-load "timestamp.mistie")' \
-e '(mistie-load "irregex.mistie")' \
-e '(mistie-main "$<")' > $@

irregex.html: irregex-base.html
grep '^<a name="SECTION_' $< |\
perl -ape 's{<a name="(SECTION_(\d+)(\.\d+)?)"><h[12]>(?:[.\d]+)(?:\s|&nbsp;)*([^<>]*).*}{($$3?($$3==.1?($$sub=1,"<ol>\n"):""):(($$x=$$sub,$$sub=0,$$x>0)?"</ol>\n":""))."<li><a href=\"#$$1\">$$4</a>"}ge;' > irregex-toc.html
perl -ape 's{^<!--\s*TOC\s*-->}{"<ol>\n".`cat irregex-toc.html`."</ol>"}e' $< > $@
rm -f $< irregex-toc.html

doc: irregex.html

test:
csi -script test-all.scm

clean:
rm -f *~ */*~ *.so

distclean: clean
rm -f *.html

dist: doc
rm -f irregex-`cat VERSION`.tar.gz
mkdir irregex-`cat VERSION`
for f in `hg manifest`; do mkdir -p irregex-`cat VERSION`/`dirname $$f`; ln -s `pwd`/$$f irregex-`cat VERSION`/$$f; done
cd irregex-`cat VERSION`; for f in `echo ../*.html`; do ln -s $$f; done; cd ..
tar cphzvf irregex-`cat VERSION`.tar.gz irregex-`cat VERSION`
rm -rf irregex-`cat VERSION`
14 changes: 14 additions & 0 deletions README
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@

irregex
-------

Portable Efficient IrRegular Expressions

http://synthcode.com/scheme/irregex/

A fully portable and efficient R[4567]RS implementation of regular
expressions, supporting both POSIX syntax with various (irregular)
PCRE extensions, as well as SCSH's SRE syntax. DFA matching is used
when possible, otherwise a closure-compiled NFA approach is used.

Documentation is in the file irregex.html.
1 change: 1 addition & 0 deletions VERSION
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
0.9.0
36 changes: 36 additions & 0 deletions benchmarks/re-benchmarks.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# These try to test a wide variety of regular expressions which will
# give very different performance when run on DFA matches,
# non-backtracking NFA matchers, and backtracking matchers. The
# Boyer-Moore tests also heuristic matchers, such as CL-PPCRE.
#
# Not checked are multiple backreference patterns (which are NP-hard
# and thus presumably require exponential time).
#
# We still need benchmarks for various special patterns such as
# bos/eos to test how clever IrRegex is about compiling DFAs.
#
# Memory usage is not yet measured.
#
# Column format:
#
# name pattern string prefix compile-repitions search-repetitions

char literal a a xxxxxxxxxx 1000 10000
string literal abccb abccb xxxxxxxxxx 1000 10000
ci string literal (?i:abccb) aBCcB xxxxxxxxxx 1000 10000
best-case boyer-moore abcdefghijklmnopq abcdefghijklmnopq xxxxxxxxxx 1000 10000
worst-case boyer-moore abbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb abbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb bbbbbbbbbb 1000 10000
alternation (?:asm|break|case|catch|const_cast|continue|default|delete|do|dynamic_cast|else|explicit|export|false|for|friend|goto|if|mutable|namespace|new|operator|private|protected|public|register|reinterpret_cast|return|sizeof|static_cast|switch|template|this|throw|true|try|typedef|typeid|typename|using|virtual|while) virtual aeiouaeiou 1 10000
basic charsets [[:digit:]][[:alnum:]] 2R abc 1000 10000
sparse charsets [aeiouAEIOU][aeiouAEIOU][aeiouAEIOU] oEU xxxxx 1000 10000

# Some more realistic examples
nonanchored RFC3986 URI regex (([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))? http://www.call-with-current-continuation.org/index.html?abc=def#ghi ### 100 1000
simplified email [[:alnum:]][[:alnum:]?.+_-]*@[[:alnum:]_-]+(\.[[:alnum:]_-]+)*\.[[:alpha:]]+ [email protected] @@@@ 100 10000
# From http://blogs.sun.com/paulj/entry/email_regex_address_validation_aka
email (")?([[:alnum:]!#$%&'*+/=?^_`{|}~-])+(\.[[:alnum:]!#$%&'*+/=?^_`{|}~-]+)*(")?@[[:alnum:]-]+(\.[[:alnum:]-])*\.? [email protected] @@@@ 100 10000

# Expensive examples which may hang or explode
backtracker a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa x 100 100
expontential dfa a[ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab] abbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb b 1 100
#backtracker + expontential dfa a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa[ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab] aaaaaaaaaaaaaaaaaaaaaaaaaaaaaabbbbbbbbbb b 1 100
29 changes: 29 additions & 0 deletions benchmarks/run-bench.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#! /usr/bin/env perl

use strict;

sub bench ($$$) {
my ($name, $sub, $n) = @_;
my $start = times;
for (my $i=0; $i<$n; $i++) { $sub->(); }
print "$name: ".((times-$start)*1000)."\n";
}

open(IN, "< re-benchmarks.txt");
while (<IN>) {
next if /^\s*(?:#.*)?$/;
my ($name, $pat, $str, $prefix, $compn, $execn) = split(/\t/);
$pat =~ s{/}{\\/}g;
bench("$name: compile-time", sub {eval "/$pat/"}, $compn);
my ($rx, $rxm, $str2);
eval "\$rx = qr/$pat/";
eval "\$rxm = qr/^$pat\$/";
bench("$name: match-time", sub {$str =~ $rxm}, $execn);
for (my $mult=1; $execn>=10; $mult*=10, $execn/=10) {
$str2 = (($prefix x $mult).$str);
bench("$name: search prefix x $mult", sub {$str2 =~ $rx}, $execn);
}
}
close(IN);


58 changes: 58 additions & 0 deletions benchmarks/run-bench.scm
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@

(use chicken extras regex data-structures srfi-13)
(include "../irregex")

(define-syntax time-expr
(syntax-rules ()
((time-expr expr)
(let ((start (nth-value 0 (cpu-time))))
expr
(- (nth-value 0 (cpu-time)) start)))))

(define (string-replicate str reps)
(let lp ((ls '()) (reps reps))
(if (<= reps 0)
(string-concatenate-reverse ls)
(lp (cons str ls) (- reps 1)))))

(define (run-bench name pat str prefix comp-count exec-count)
(let-syntax
((bench (syntax-rules ()
((bench variation expr count)
(let ((time-taken
(time-expr (do ((i count (- i 1)))
((< i 0))
expr))))
(display name) (display ": ")
(display variation) (display ": ")
(write time-taken) (newline))))))
(let ((comp-count (string->number comp-count))
(exec-count (string->number exec-count)))
;; compile time
(bench "compile-time" (string->irregex pat) comp-count)
(let ((irx (string->irregex pat)))
;; match time
(bench "match-time" (irregex-match irx str) exec-count)
;; search times
(let lp ((mult 1) (reps exec-count))
(cond
((>= reps 10)
(let ((str (string-append (string-replicate prefix mult) str)))
(bench (string-append "search prefix x " (number->string mult))
(irregex-search irx str)
reps)
(lp (* mult 10) (quotient reps 10))))))))))

(call-with-input-file "re-benchmarks.txt"
(lambda (in)
(let lp ()
(let ((line (read-line in)))
(cond
((eof-object? line))
((string-match "^\\s*(?:#.*)?$" line)
(lp))
(else
(let ((ls (string-split line "\t")))
(apply run-bench ls)
(lp))))))))

7 changes: 7 additions & 0 deletions error.scm
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@

(define (error msg . args)
(display msg)
(for-each (lambda (x) (display " ") (write x)) args)
(newline)
(0))

22 changes: 22 additions & 0 deletions irregex-chicken.scm
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@

(module irregex
(irregex string->irregex sre->irregex
string->sre maybe-string->sre
irregex? irregex-match-data?
irregex-new-matches irregex-reset-matches!
irregex-search irregex-search/matches irregex-match
irregex-search/chunked irregex-match/chunked make-irregex-chunker
irregex-match-substring irregex-match-subchunk
irregex-match-start-chunk irregex-match-start-index
irregex-match-end-chunk irregex-match-end-index
irregex-match-num-submatches irregex-match-names
irregex-match-valid-index?
irregex-fold irregex-replace irregex-replace/all
irregex-dfa irregex-dfa/search irregex-dfa/extract
irregex-nfa irregex-flags irregex-lengths irregex-names
irregex-num-submatches irregex-extract irregex-split
)
(import scheme)
(import chicken)
(include "irregex.scm")
)
20 changes: 20 additions & 0 deletions irregex-guile.scm
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
;;; Install irregex-guile.scm to $GUILE_SITE_DIR/rx/irregex.scm
;;; and irregex.scm to $GUILE_SITE_DIR/rx/source/irregex.scm

(define-module (rx irregex)
#:export (irregex string->irregex sre->irregex string->sre
maybe-string->sre irregex? irregex-match-data?
irregex-new-matches irregex-reset-matches! irregex-search
irregex-search/matches irregex-match
irregex-search/chunked irregex-match/chunked
make-irregex-chunker irregex-match-substring
irregex-match-subchunk irregex-match-start-source
irregex-match-start-index irregex-match-end-source
irregex-match-end-index irregex-match-num-submatches
irregex-match-names irregex-match-valid-index?
irregex-fold irregex-replace irregex-replace/all
irregex-dfa irregex-dfa/search irregex-dfa/extract
irregex-nfa irregex-flags irregex-lengths irregex-names
irregex-num-submatches irregex-extract irregex-split))

(load-from-path "rx/source/irregex.scm")
Loading

0 comments on commit 3de590e

Please sign in to comment.