Commit 3513066

1.0.0-beta

s0md3v committed Apr 4, 2023
1 parent 6122498 commit 3513066
Showing 4 changed files with 220 additions and 83 deletions.
2 changes: 1 addition & 1 deletion uro/__init__.py
@@ -1 +1 @@
__version__ = '0.0.5'
__version__ = '1.0.0-beta'
63 changes: 63 additions & 0 deletions uro/filters.py
@@ -0,0 +1,63 @@
import re

re_content = re.compile(r'(post|blog)s?|docs|support/|/(\d{4}|pages?)/\d+/')

def check_ext(path, exts):
    """
    checks if a url has an extension and whether it's in the given list
    returns a (has_ext, ext_in_list) tuple
    """
    if '.' not in path.split('/')[-1]:
        return False, False
    return True, path.lower().endswith(tuple(exts))

def has_ext(path, params, meta):
    """
    returns True if url has an extension e.g. example.com/profile.php
    """
    has_ext, _ = check_ext(path, [])
    return has_ext

def no_ext(path, params, meta):
    """
    returns True if url has no extension e.g. example.com/about-us/team
    """
    has_ext, _ = check_ext(path, [])
    return not has_ext

def has_params(path, params, meta):
    """
    returns True if url has parameters
    """
    return len(params) > 0

def no_params(path, params, meta):
    """
    returns True if url has no parameters
    """
    return len(params) == 0

def whitelisted(path, params, meta):
    """
    returns True if url has a whitelisted extension,
    or no extension at all when strict mode is off
    """
    has_ext, is_ext = check_ext(path, meta['ext_list'])
    return is_ext or (not meta['strict'] and not has_ext)

def blacklisted(path, params, meta):
    """
    returns True if url has no extension or doesn't have a blacklisted extension
    """
    has_ext, is_ext = check_ext(path, meta['ext_list'])
    return not is_ext or (not meta['strict'] and not has_ext)

def remove_content(path, params, meta):
    """
    returns False if the path is likely to contain
    human-written content e.g. a blog post
    """
    for part in path.split('/'):
        if part.count('-') > 3:
            return False
    return False if re_content.search(path) else True
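
Each filter takes the same (path, params, meta) signature and returns True when the URL should be kept. A minimal sketch of how the predicates behave, assuming a meta dict like the one apply_filters() builds in uro.py (the extension list is chosen for illustration):

meta = {'strict': False, 'ext_list': ['php', 'asp', 'aspx']}

check_ext('/profile.php', meta['ext_list'])    # (True, True): has an extension, and it's in the list
check_ext('/logo.png', meta['ext_list'])       # (True, False): has an extension, not in the list
check_ext('/about-us/team', meta['ext_list'])  # (False, False): no extension at all

whitelisted('/about-us/team', {}, meta)        # True: extension-less urls pass when strict is off
remove_content('/some-long-post-title-here', {}, meta)  # False: 4 hyphens suggest an article slug
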
182 changes: 100 additions & 82 deletions uro/uro.py
@@ -1,71 +1,55 @@
import argparse
import re
import sys
from urllib.parse import urlparse

from uro.utils import *
from uro.filters import *

try:
    from signal import signal, SIGPIPE, SIG_DFL
    signal(SIGPIPE, SIG_DFL)
except ImportError:
    pass

parser = argparse.ArgumentParser()
parser.add_argument('-i', help='file containing urls', dest='input_file')
parser.add_argument('-o', help='output file', dest='output_file')
parser.add_argument('-w', '--whitelist', help='only keep these extensions and extension-less urls', dest='whitelist', nargs='+')
parser.add_argument('-b', '--blacklist', help='remove these extensions', dest='blacklist', nargs='+')
parser.add_argument('-f', '--filters', help='additional filters, read docs', dest='filters', nargs='+')
args = parser.parse_args()

filters = clean_nargs(args.filters)
active_filters = ['removecontent']

if not args.whitelist or "allexts" in filters:
    active_filters.append('blacklist')
if args.whitelist:
    active_filters.append('whitelist')

active_filters.extend(filters)

if 'keepcontent' in active_filters:
    active_filters.remove('removecontent')
    active_filters.remove('keepcontent')
urlmap = {}
params_seen = []
patterns_seen = []

re_int = re.compile(r'/\d+([?/]|$)')
re_content = re.compile(r'(post|blog)s?|docs|support/|/(\d{4}|pages?)/\d+/')
static_exts = ('js', 'css', 'png', 'jpg', 'jpeg', 'svg',

ext_list = clean_nargs(args.blacklist) if args.blacklist else ('js', 'css', 'png', 'jpg', 'jpeg', 'svg',
    'ico', 'webp', 'ttf', 'otf', 'woff', 'gif',
    'pdf', 'bmp', 'eot', 'mp3', 'woff2', 'mp4', 'avi'
)


def params_to_dict(params: str) -> list:
    """
    converts query string to dict
    """
    the_dict = {}
    if params:
        for pair in params.split('&'):
            parts = pair.split('=')
            try:
                the_dict[parts[0]] = parts[1]
            except IndexError:
                pass
    return the_dict


def dict_to_params(params: dict) -> str:
    """
    converts dict of params to query string
    """
    stringed = [name + '=' + value for name, value in params.items()]
    return '?' + '&'.join(stringed)


def compare_params(og_params: list, new_params: dict) -> bool:
    """
    checks if new_params contain a param
    that doesn't exist in og_params
    """
    og_set = set([])
    for each in og_params:
        for key in each.keys():
            og_set.add(key)
    return set(new_params.keys()) - og_set


def is_content(path: str) -> bool:
    """
    checks if a path is likely to contain
    human written content e.g. a blog
    """
    for part in path.split('/'):
        if part.count('-') > 3:
            return True
    return False
if args.whitelist:
    ext_list = clean_nargs(args.whitelist)


def create_pattern(path: str) -> str:
def create_pattern(path):
    """
    creates patterns for urls with integers in them
    """
@@ -78,7 +62,7 @@ def create_pattern(path: str) -> str:
    return '/'.join(new_parts)


def pattern_exists(pattern: str) -> bool:
def pattern_exists(pattern):
    """
    checks if an int pattern exists
    """
@@ -91,7 +75,7 @@ def pattern_exists(pattern: str) -> bool:
    return False


def matches_patterns(path: str) -> bool:
def matches_patterns(path):
    """
    checks if the url matches any of the int patterns
    """
@@ -100,15 +84,7 @@ def matches_patterns(path: str) -> bool:
            return True
    return False


def has_bad_ext(path: str) -> bool:
    """
    checks if a url has a blacklisted extension
    """
    return False if '/' in path.split('.')[-1] else path.lower().endswith(static_exts)


def is_new_param(params: list) -> bool:
def is_new_param(params):
    """
    checks if there's an unseen param within given params
    """
@@ -118,29 +94,71 @@ def is_new_param(params: list) -> bool:
    return True


def apply_filters(path, params):
    """
    apply filters to a url
    returns True if the url should be kept
    """
    filter_map = {
        'hasext': has_ext,
        'noext': no_ext,
        'hasparams': has_params,
        'noparams': no_params,
        'removecontent': remove_content,
        'blacklist': blacklisted,
        'whitelist': whitelisted,
    }
    meta = {
        # strict mode applies when either extension filter was requested
        'strict': 'hasext' in filters or 'noext' in filters,
        'ext_list': ext_list,
    }
    for filter_name in active_filters:
        if filter_name in filter_map:
            if not filter_map[filter_name](path, params, meta):
                return False
    return True
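
For instance, a hypothetical run of `uro -w php asp -f hasparams` would leave the module-level state roughly as below, and apply_filters() then rejects a URL as soon as any active filter fails (a sketch; exact values depend on the CLI flags):

# sketch of the globals after parsing `uro -w php asp -f hasparams`
filters = ['hasparams']
active_filters = ['removecontent', 'whitelist', 'hasparams']
ext_list = ['php', 'asp']

apply_filters('/index.php', {'id': '1'})  # True: whitelisted extension and has params
apply_filters('/index.php', {})           # False: fails hasparams
apply_filters('/style.css', {'v': '2'})   # False: fails whitelist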


def process_url(url):
    """
    processes a url
    """
    host = url.scheme + '://' + url.netloc
    if host not in urlmap:
        urlmap[host] = {}
    path, params = url.path, params_to_dict(url.query)
    has_new_param = False if not params else is_new_param(params.keys())
    new_params = [param for param in params.keys() if param not in params_seen]
    params_seen.extend(new_params)
    if (not params or has_new_param) and re_int.search(path):
        pattern = create_pattern(path)
        if not pattern_exists(pattern):
            patterns_seen.append(pattern)
        elif matches_patterns(path):
            return
    keep_url = apply_filters(path, params)
    if keep_url:
        if path not in urlmap[host]:
            urlmap[host][path] = [params] if params else []
        elif has_new_param or compare_params(urlmap[host][path], params):
            urlmap[host][path].append(params)

def main():
    if not sys.stdin.isatty():
        for line in sys.stdin:
            parsed = urlparse(line.strip())
            host = parsed.scheme + '://' + parsed.netloc
            if host not in urlmap:
                urlmap[host] = {}
            path, params = parsed.path, params_to_dict(parsed.query)
            has_new_param = False if not params else is_new_param(params.keys())
            new_params = [param for param in params.keys() if param not in params_seen]
            params_seen.extend(new_params)
            if has_bad_ext(path) or re_content.search(path) or is_content(path):
                continue
            if (not params or has_new_param) and re_int.search(path):
                pattern = create_pattern(path)
                if not pattern_exists(pattern):
                    patterns_seen.append(pattern)
                elif matches_patterns(path):
                    continue
            if path not in urlmap[host]:
                urlmap[host][path] = [params] if params else []
            elif has_new_param or compare_params(urlmap[host][path], params):
                urlmap[host][path].append(params)
    input_stream = open(args.input_file, 'r') if args.input_file else None
    if not input_stream:
        if not sys.stdin.isatty():
            input_stream = sys.stdin
    if not input_stream:
        print('[ERROR] No input file or stdin.', file=sys.stderr)
        exit(1)
    for line in input_stream:
        cleanline = line.strip() if 'keepslash' in filters else line.strip().rstrip('/')
        parsed_url = urlparse(cleanline)
        if parsed_url.netloc:
            process_url(parsed_url)
    og_stdout = sys.stdout
    sys.stdout = open(args.output_file, 'a+') if args.output_file else sys.stdout
    for host, value in urlmap.items():
        for path, params in value.items():
            if params:
…
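
The integer-pattern logic above collapses URLs that differ only by a numeric path segment: re_int decides which paths are candidates, and create_pattern (its body is collapsed in the diff) records one pattern per path shape, so /photo/2 is skipped once /photo/1 has been seen. A quick check of what re_int actually matches:

import re
re_int = re.compile(r'/\d+([?/]|$)')

bool(re_int.search('/photo/1'))       # True: integer is the final segment
bool(re_int.search('/photo/1/edit'))  # True: integer segment followed by '/'
bool(re_int.search('/photo1'))        # False: digits must form a whole segment
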
56 changes: 56 additions & 0 deletions uro/utils.py
@@ -0,0 +1,56 @@
def clean_nargs(args):
    """
    cleans nargs to prevent user errors
    """
    if not args:
        return []
    new_args = []
    if len(args) == 1:
        if "," in args[0]:
            new_args = [arg.lower() for arg in args[0].strip().split(',')]
        elif " " in args[0]:
            new_args = [arg.lower() for arg in args[0].split(' ')]
        else:
            new_args.append(args[0].lower())
    else:
        for arg in args:
            cleaner = clean_nargs([arg])
            if cleaner:
                new_args.extend(cleaner)
            else:
                new_args.append(arg)
    return list(set(filter(None, new_args)))
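
clean_nargs normalizes however the user happened to pass values: comma-separated, space-separated, or as repeated nargs. A quick illustration (the result passes through set(), so ordering may differ):

clean_nargs(['PHP,asp'])     # ['php', 'asp']
clean_nargs(['php asp'])     # ['php', 'asp']
clean_nargs(['PHP', 'ASP'])  # ['php', 'asp']
clean_nargs(None)            # []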

def params_to_dict(params):
    """
    converts query string to dict
    """
    the_dict = {}
    if params:
        for pair in params.split('&'):
            parts = pair.split('=')
            try:
                the_dict[parts[0]] = parts[1]
            except IndexError:
                pass
    return the_dict


def dict_to_params(params):
    """
    converts dict of params to query string
    """
    stringed = [name + '=' + value for name, value in params.items()]
    return '?' + '&'.join(stringed)


def compare_params(og_params, new_params):
    """
    checks if new_params contain a param
    that doesn't exist in og_params
    """
    og_set = set([])
    for each in og_params:
        for key in each.keys():
            og_set.add(key)
    return set(new_params.keys()) - og_set
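
compare_params returns the set of parameter names in new_params that don't appear in og_params (truthy when at least one is new); process_url uses this to decide whether a URL variant is worth keeping. A short illustration:

seen = [params_to_dict('id=1&lang=en')]               # [{'id': '1', 'lang': 'en'}]
compare_params(seen, params_to_dict('id=2'))          # set(): nothing new
compare_params(seen, params_to_dict('id=2&debug=1'))  # {'debug'}: unseen param
dict_to_params({'id': '2', 'debug': '1'})             # '?id=2&debug=1'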
