Skip to content

Commit

Permalink
Various bug fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
PaleNeutron committed Aug 13, 2021
1 parent 09f2e5c commit b299692
Show file tree
Hide file tree
Showing 8 changed files with 32 additions and 30 deletions.
18 changes: 9 additions & 9 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
tornado==5.0.2
peewee==3.2.2
requests==2.25.1
pycurl==7.43.0.1
sanic==20.12.2
sanic-cors==0.10.0.post3
schedule==0.5.0
six==1.11.0
playwright==1.9.2
tornado==5.1.1
peewee==3.14.4
requests==2.26.0
pycurl==7.44.0
sanic==21.6.2
sanic-cors
schedule==1.1.0
six==1.16.0
playwright==1.13.1
pyquery==1.4.3
2 changes: 1 addition & 1 deletion scylla/providers/free_proxy_list_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ def parse(self, document: PyQuery) -> [ProxyIP]:
ip_list: [ProxyIP] = []

for ip_row in document.find('#proxylisttable tbody tr'):
ip_row: PyQuery = ip_row
ip_row: PyQuery = PyQuery(ip_row)
ip_address: str = ip_row.find('td:nth-child(1)').text()
port: str = ip_row.find('td:nth-child(2)').text()

Expand Down
2 changes: 1 addition & 1 deletion scylla/providers/ipaddress_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ def parse(self, document: PyQuery) -> [ProxyIP]:
ip_list: [ProxyIP] = []

for ip_row in document.find('.proxylist tbody tr'):
ip_row: PyQuery = ip_row
ip_row: PyQuery = PyQuery(ip_row)
ip_port: str = ip_row.find('td:nth-child(1)').text()
ip_address, port = ip_port.split(":")

Expand Down
16 changes: 8 additions & 8 deletions scylla/providers/proxy_list_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from scylla.database import ProxyIP
from scylla.worker import Worker
from .base_provider import BaseProvider

import urllib.parse

class ProxyListProvider(BaseProvider):

Expand Down Expand Up @@ -35,14 +35,14 @@ def parse(self, document: PyQuery) -> [ProxyIP]:
def urls(self) -> [str]:
ret = []
first_url = 'http://proxy-list.org/english/index.php?p=1'
sub = first_url[0:first_url.rfind('/')] # http://proxy-list.org/english
# sub = first_url[0:first_url.rfind('/')] # http://proxy-list.org/english
first_page = self.w.get_html(first_url, False)

ret.append(first_url)
for a in first_page.find('#content div.content div.table-menu a.item'):
relative_path = a.attrs['href']
absolute_url = sub + relative_path[relative_path.find('/'):]
ret.append(absolute_url)
if first_page:
ret.append(first_url)
for a in first_page.find('#content div.content div.table-menu a.item'):
relative_path = a.attrib['href']
absolute_url = urllib.parse.urljoin(first_url, relative_path)
ret.append(absolute_url)
return ret


Expand Down
2 changes: 1 addition & 1 deletion scylla/providers/proxy_scraper_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def parse(self, document: PyQuery) -> [ProxyIP]:
ip_list: [ProxyIP] = []

text = document.html()
json_object = json.load(text)
json_object = json.loads(text)
if not json_object or type(json_object['usproxy']) != list:
return ip_list

Expand Down
11 changes: 6 additions & 5 deletions scylla/providers/proxylists_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,12 @@ def urls(self) -> [str]:
ret = set([])
country_url = 'http://www.proxylists.net/countries.html'
country_page = self.w.get_html(country_url, False)
for a in country_page.find('a'):
relative_path = a.attrs['href']
if self.country_patten.match(relative_path) :
ret.update(self.gen_url_for_country(self.country_patten.findall(relative_path)[0]))
break
if country_page:
for a in country_page.find('a'):
relative_path = a.attrs['href']
if self.country_patten.match(relative_path) :
ret.update(self.gen_url_for_country(self.country_patten.findall(relative_path)[0]))
break
return list(ret)

def gen_url_for_country(self, country) -> [str]:
Expand Down
2 changes: 1 addition & 1 deletion scylla/providers/spys_me_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def parse(self, document: PyQuery) -> [ProxyIP]:

text = document.html()

ip_port_str_list = re.findall(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{2,5}', text.decode('utf-8'))
ip_port_str_list = re.findall(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{2,5}', text)

for ip_port in ip_port_str_list:

Expand Down
9 changes: 5 additions & 4 deletions scylla/scheduler.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import time
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime, timedelta
from multiprocessing import Queue, Process
from multiprocessing import Queue, Process, Manager
from threading import Thread

import schedule
Expand All @@ -22,7 +22,7 @@ def fetch_ips(q: Queue, validator_queue: Queue):

while True:
try:
provider: BaseProvider = q.get()
provider: BaseProvider = q.get()()

provider_name = provider.__class__.__name__

Expand All @@ -32,7 +32,7 @@ def fetch_ips(q: Queue, validator_queue: Queue):
try:
html = worker.get_html(url, render_js=provider.should_render_js())
except Exception as e:
logger.error("worker.get_html failed: ", e)
logger.error("worker.get_html failed: %s", e)
continue

if html:
Expand Down Expand Up @@ -111,6 +111,7 @@ def feed_from_db():
class Scheduler(object):

def __init__(self):
self.manager = Manager()
self.worker_queue = Queue()
self.validator_queue = Queue()
self.worker_process = None
Expand Down Expand Up @@ -157,7 +158,7 @@ def feed_providers(self):
logger.debug('feed {} providers...'.format(len(all_providers)))

for provider in all_providers:
self.worker_queue.put(provider())
self.worker_queue.put(provider)

def stop(self):
self.worker_queue.close()
Expand Down

0 comments on commit b299692

Please sign in to comment.