Commit 3513066

1.0.0-beta

s0md3v committed Apr 4, 2023
1 parent 6122498 commit 3513066
Showing 4 changed files with 220 additions and 83 deletions.
2 changes: 1 addition & 1 deletion uro/__init__.py
@@ -1 +1 @@
__version__ = '0.0.5'
__version__ = '1.0.0-beta'
63 changes: 63 additions & 0 deletions uro/filters.py
@@ -0,0 +1,63 @@
import re

re_content = re.compile(r'(post|blog)s?|docs|support/|/(\d{4}|pages?)/\d+/')

def check_ext(path, exts):
    """
    checks if a url has an extension and whether it's in the given list
    returns a (has_ext, ext_in_list) tuple
    """
    if '.' not in path.split('/')[-1]:
        return False, False
    return True, path.lower().endswith(tuple(exts))

def has_ext(path, params, meta):
    """
    returns True if url has an extension e.g. example.com/profile.php
    """
    has_ext, _ = check_ext(path, [])
    return has_ext

def no_ext(path, params, meta):
    """
    returns True if url has no extension e.g. example.com/about-us/team
    """
    has_ext, _ = check_ext(path, [])
    return not has_ext

def has_params(path, params, meta):
    """
    returns True if url has parameters
    """
    return len(params) > 0

def no_params(path, params, meta):
    """
    returns True if url has no parameters
    """
    return len(params) == 0

def whitelisted(path, params, meta):
    """
    returns True if url has a whitelisted extension,
    or no extension at all when strict mode is off
    """
    has_ext, is_ext = check_ext(path, meta['ext_list'])
    return is_ext or (not meta['strict'] and not has_ext)

def blacklisted(path, params, meta):
    """
    returns True if url has no extension or doesn't have a blacklisted extension
    """
    has_ext, is_ext = check_ext(path, meta['ext_list'])
    return not is_ext or (not meta['strict'] and not has_ext)

def remove_content(path, params, meta):
    """
    returns False if the path is likely to contain
    human-written content e.g. a blog post
    """
    for part in path.split('/'):
        if part.count('-') > 3:
            return False
    return False if re_content.search(path) else True
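
Each filter takes the same (path, params, meta) signature and returns True when the URL should be kept. A minimal sketch of how the predicates behave, assuming a meta dict like the one apply_filters() builds in uro.py (the extension list is chosen for illustration):

meta = {'strict': False, 'ext_list': ['php', 'asp', 'aspx']}

check_ext('/profile.php', meta['ext_list'])    # (True, True): has an extension, and it's in the list
check_ext('/logo.png', meta['ext_list'])       # (True, False): has an extension, not in the list
check_ext('/about-us/team', meta['ext_list'])  # (False, False): no extension at all

whitelisted('/about-us/team', {}, meta)        # True: extension-less urls pass when strict is off
remove_content('/some-long-post-title-here', {}, meta)  # False: 4 hyphens suggest an article slug
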
182 changes: 100 additions & 82 deletions uro/uro.py
@@ -1,71 +1,55 @@
import argparse
import re
import sys
from urllib.parse import urlparse

from uro.utils import *
from uro.filters import *

try:
    from signal import signal, SIGPIPE, SIG_DFL
    signal(SIGPIPE, SIG_DFL)
except ImportError:
    pass

parser = argparse.ArgumentParser()
parser.add_argument('-i', help='file containing urls', dest='input_file')
parser.add_argument('-o', help='output file', dest='output_file')
parser.add_argument('-w', '--whitelist', help='only keep these extensions and extension-less urls', dest='whitelist', nargs='+')
parser.add_argument('-b', '--blacklist', help='remove these extensions', dest='blacklist', nargs='+')
parser.add_argument('-f', '--filters', help='additional filters, read docs', dest='filters', nargs='+')
args = parser.parse_args()

filters = clean_nargs(args.filters)
active_filters = ['removecontent']

if not args.whitelist or "allexts" in filters:
    active_filters.append('blacklist')
if args.whitelist:
    active_filters.append('whitelist')

active_filters.extend(filters)

if 'keepcontent' in active_filters:
    active_filters.remove('removecontent')
    active_filters.remove('keepcontent')
urlmap = {}
params_seen = []
patterns_seen = []

re_int = re.compile(r'/\d+([?/]|$)')
re_content = re.compile(r'(post|blog)s?|docs|support/|/(\d{4}|pages?)/\d+/')
static_exts = ('js', 'css', 'png', 'jpg', 'jpeg', 'svg',

ext_list = clean_nargs(args.blacklist) if args.blacklist else ('js', 'css', 'png', 'jpg', 'jpeg', 'svg',
    'ico', 'webp', 'ttf', 'otf', 'woff', 'gif',
    'pdf', 'bmp', 'eot', 'mp3', 'woff2', 'mp4', 'avi'
)


def params_to_dict(params: str) -> list:
    """
    converts query string to dict
    """
    the_dict = {}
    if params:
        for pair in params.split('&'):
            parts = pair.split('=')
            try:
                the_dict[parts[0]] = parts[1]
            except IndexError:
                pass
    return the_dict


def dict_to_params(params: dict) -> str:
    """
    converts dict of params to query string
    """
    stringed = [name + '=' + value for name, value in params.items()]
    return '?' + '&'.join(stringed)


def compare_params(og_params: list, new_params: dict) -> bool:
    """
    checks if new_params contain a param
    that doesn't exist in og_params
    """
    og_set = set([])
    for each in og_params:
        for key in each.keys():
            og_set.add(key)
    return set(new_params.keys()) - og_set


def is_content(path: str) -> bool:
    """
    checks if a path is likely to contain
    human written content e.g. a blog
    """
    for part in path.split('/'):
        if part.count('-') > 3:
            return True
    return False
if args.whitelist:
    ext_list = clean_nargs(args.whitelist)


def create_pattern(path: str) -> str:
def create_pattern(path):
    """
    creates patterns for urls with integers in them
    """
@@ -78,7 +62,7 @@ def create_pattern(path: str) -> str:
    return '/'.join(new_parts)


def pattern_exists(pattern: str) -> bool:
def pattern_exists(pattern):
    """
    checks if an int pattern exists
    """
@@ -91,7 +75,7 @@ def pattern_exists(pattern: str) -> bool:
    return False


def matches_patterns(path: str) -> bool:
def matches_patterns(path):
    """
    checks if the url matches any of the int patterns
    """
@@ -100,15 +84,7 @@ def matches_patterns(path: str) -> bool:
            return True
    return False


def has_bad_ext(path: str) -> bool:
    """
    checks if a url has a blacklisted extension
    """
    return False if '/' in path.split('.')[-1] else path.lower().endswith(static_exts)


def is_new_param(params: list) -> bool:
def is_new_param(params):
    """
    checks if there's an unseen param within given params
    """
@@ -118,29 +94,71 @@ def is_new_param(params: list) -> bool:
    return True


def apply_filters(path, params):
    """
    apply filters to a url
    returns True if the url should be kept
    """
    filter_map = {
        'hasext': has_ext,
        'noext': no_ext,
        'hasparams': has_params,
        'noparams': no_params,
        'removecontent': remove_content,
        'blacklist': blacklisted,
        'whitelist': whitelisted,
    }
    meta = {
        # strict mode applies when either extension filter was requested
        'strict': 'hasext' in filters or 'noext' in filters,
        'ext_list': ext_list,
    }
    for filter_name in active_filters:
        if filter_name in filter_map:
            if not filter_map[filter_name](path, params, meta):
                return False
    return True
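
For instance, a hypothetical run of `uro -w php asp -f hasparams` would leave the module-level state roughly as below, and apply_filters() then rejects a URL as soon as any active filter fails (a sketch; exact values depend on the CLI flags):

# sketch of the globals after parsing `uro -w php asp -f hasparams`
filters = ['hasparams']
active_filters = ['removecontent', 'whitelist', 'hasparams']
ext_list = ['php', 'asp']

apply_filters('/index.php', {'id': '1'})  # True: whitelisted extension and has params
apply_filters('/index.php', {})           # False: fails hasparams
apply_filters('/style.css', {'v': '2'})   # False: fails whitelist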


def process_url(url):
    """
    processes a url
    """
    host = url.scheme + '://' + url.netloc
    if host not in urlmap:
        urlmap[host] = {}
    path, params = url.path, params_to_dict(url.query)
    has_new_param = False if not params else is_new_param(params.keys())
    new_params = [param for param in params.keys() if param not in params_seen]
    params_seen.extend(new_params)
    if (not params or has_new_param) and re_int.search(path):
        pattern = create_pattern(path)
        if not pattern_exists(pattern):
            patterns_seen.append(pattern)
        elif matches_patterns(path):
            return
    keep_url = apply_filters(path, params)
    if keep_url:
        if path not in urlmap[host]:
            urlmap[host][path] = [params] if params else []
        elif has_new_param or compare_params(urlmap[host][path], params):
            urlmap[host][path].append(params)

def main():
    if not sys.stdin.isatty():
        for line in sys.stdin:
            parsed = urlparse(line.strip())
            host = parsed.scheme + '://' + parsed.netloc
            if host not in urlmap:
                urlmap[host] = {}
            path, params = parsed.path, params_to_dict(parsed.query)
            has_new_param = False if not params else is_new_param(params.keys())
            new_params = [param for param in params.keys() if param not in params_seen]
            params_seen.extend(new_params)
            if has_bad_ext(path) or re_content.search(path) or is_content(path):
                continue
            if (not params or has_new_param) and re_int.search(path):
                pattern = create_pattern(path)
                if not pattern_exists(pattern):
                    patterns_seen.append(pattern)
                elif matches_patterns(path):
                    continue
            if path not in urlmap[host]:
                urlmap[host][path] = [params] if params else []
            elif has_new_param or compare_params(urlmap[host][path], params):
                urlmap[host][path].append(params)
    input_stream = open(args.input_file, 'r') if args.input_file else None
    if not input_stream:
        if not sys.stdin.isatty():
            input_stream = sys.stdin
    if not input_stream:
        print('[ERROR] No input file or stdin.', file=sys.stderr)
        exit(1)
    for line in input_stream:
        cleanline = line.strip() if 'keepslash' in filters else line.strip().rstrip('/')
        parsed_url = urlparse(cleanline)
        if parsed_url.netloc:
            process_url(parsed_url)
    og_stdout = sys.stdout
    sys.stdout = open(args.output_file, 'a+') if args.output_file else sys.stdout
    for host, value in urlmap.items():
        for path, params in value.items():
            if params:
…
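
The integer-pattern logic above collapses URLs that differ only by a numeric path segment: re_int decides which paths are candidates, and create_pattern (its body is collapsed in the diff) records one pattern per path shape, so /photo/2 is skipped once /photo/1 has been seen. A quick check of what re_int actually matches:

import re
re_int = re.compile(r'/\d+([?/]|$)')

bool(re_int.search('/photo/1'))       # True: integer is the final segment
bool(re_int.search('/photo/1/edit'))  # True: integer segment followed by '/'
bool(re_int.search('/photo1'))        # False: digits must form a whole segment
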
56 changes: 56 additions & 0 deletions uro/utils.py
@@ -0,0 +1,56 @@
def clean_nargs(args):
    """
    cleans nargs to prevent user errors
    """
    if not args:
        return []
    new_args = []
    if len(args) == 1:
        if "," in args[0]:
            new_args = [arg.lower() for arg in args[0].strip().split(',')]
        elif " " in args[0]:
            new_args = [arg.lower() for arg in args[0].split(' ')]
        else:
            new_args.append(args[0].lower())
    else:
        for arg in args:
            cleaner = clean_nargs([arg])
            if cleaner:
                new_args.extend(cleaner)
            else:
                new_args.append(arg)
    return list(set(filter(None, new_args)))
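
clean_nargs normalizes however the user happened to pass values: comma-separated, space-separated, or as repeated nargs. A quick illustration (the result passes through set(), so ordering may differ):

clean_nargs(['PHP,asp'])     # ['php', 'asp']
clean_nargs(['php asp'])     # ['php', 'asp']
clean_nargs(['PHP', 'ASP'])  # ['php', 'asp']
clean_nargs(None)            # []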

def params_to_dict(params):
    """
    converts query string to dict
    """
    the_dict = {}
    if params:
        for pair in params.split('&'):
            parts = pair.split('=')
            try:
                the_dict[parts[0]] = parts[1]
            except IndexError:
                pass
    return the_dict


def dict_to_params(params):
    """
    converts dict of params to query string
    """
    stringed = [name + '=' + value for name, value in params.items()]
    return '?' + '&'.join(stringed)


def compare_params(og_params, new_params):
    """
    checks if new_params contain a param
    that doesn't exist in og_params
    """
    og_set = set([])
    for each in og_params:
        for key in each.keys():
            og_set.add(key)
    return set(new_params.keys()) - og_set
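
compare_params returns the set of parameter names in new_params that don't appear in og_params (truthy when at least one is new); process_url uses this to decide whether a URL variant is worth keeping. A short illustration:

seen = [params_to_dict('id=1&lang=en')]               # [{'id': '1', 'lang': 'en'}]
compare_params(seen, params_to_dict('id=2'))          # set(): nothing new
compare_params(seen, params_to_dict('id=2&debug=1'))  # {'debug'}: unseen param
dict_to_params({'id': '2', 'debug': '1'})             # '?id=2&debug=1'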
