-
Notifications
You must be signed in to change notification settings - Fork 512
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #112 from scrapinghub/HAR-multi-pages
[MRG] HAR support take 2
- Loading branch information
Showing
27 changed files
with
1,637 additions
and
78 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
[submodule "splash/vendor/harviewer"] | ||
path = splash/vendor/harviewer | ||
url = https://github.com/janodvarko/harviewer.git | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,3 +9,5 @@ adblockparser >= 0.2 | |
pyOpenSSL | ||
requests >= 1.0 | ||
Pillow | ||
jsonschema | ||
strict-rfc3339 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,8 +2,13 @@ | |
import os | ||
import re | ||
|
||
|
||
def _path(*args):
    """
    Return a path made by joining ``args`` onto this file's directory.

    :param args: path components, relative to the directory of this file.
    :return: the joined path, as produced by ``os.path.join``.
    """
    return os.path.join(os.path.dirname(__file__), *args)
|
||
|
||
def get_version():
    """
    Return the version string declared in ``splash/__init__.py``.

    The file is read as text and searched for a
    ``__version__ = '...'`` assignment; it is not imported.

    :return: the text between the quotes of the ``__version__`` assignment.
    :raises RuntimeError: if no ``__version__`` assignment is found.
    """
    filename = _path('splash', '__init__.py')
    # Read bytes and decode explicitly: this behaves the same on Python 2
    # and Python 3, whereas a text-mode read() returns ``str`` on Python 3,
    # which has no ``decode`` method.
    with open(filename, 'rb') as fp:
        contents = fp.read().decode('utf8')
    match = re.search(r"__version__ = ['\"](.+)['\"]", contents)
    if match is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise RuntimeError("Unable to find __version__ in %s" % filename)
    return match.group(1)
|
@@ -12,7 +17,6 @@ def get_version(): | |
setup_args = { | ||
'name': 'splash', | ||
'version': get_version(), | ||
'packages': ['splash'], | ||
'url': 'https://github.com/scrapinghub/splash', | ||
'description': 'A javascript rendered with a HTTP API', | ||
'long_description': open('README.rst').read(), | ||
|
@@ -22,6 +26,26 @@ def get_version(): | |
'maintainer_email': '[email protected]', | ||
'license': 'BSD', | ||
'scripts': ['bin/splash'], | ||
'packages': ['splash', 'splash.har'], | ||
'package_data': {'splash': [ | ||
'vendor/harviewer/webapp/css/*.css', | ||
'vendor/harviewer/webapp/css/images/*.*', | ||
'vendor/harviewer/webapp/css/images/menu/*.*', | ||
'vendor/harviewer/webapp/scripts/*.*', | ||
'vendor/harviewer/webapp/scripts/core/*.*', | ||
'vendor/harviewer/webapp/scripts/domplate/*.*', | ||
'vendor/harviewer/webapp/scripts/downloadify/js/*.*', | ||
'vendor/harviewer/webapp/scripts/downloadify/src/*.*', | ||
'vendor/harviewer/webapp/scripts/downloadify/media/*.*', | ||
'vendor/harviewer/webapp/scripts/excanvas/*.*', | ||
'vendor/harviewer/webapp/scripts/jquery-plugins/*.*', | ||
'vendor/harviewer/webapp/scripts/json-query/*.*', | ||
'vendor/harviewer/webapp/scripts/nls/*.*', | ||
'vendor/harviewer/webapp/scripts/preview/*.*', | ||
'vendor/harviewer/webapp/scripts/syntax-highlighter/*.js', | ||
'vendor/harviewer/webapp/scripts/tabs/*.*', | ||
'vendor/harviewer/webapp/har.js', | ||
]}, | ||
'classifiers': [ | ||
'Programming Language :: Python', | ||
'Programming Language :: Python :: 2', | ||
|
@@ -41,6 +65,7 @@ def get_version(): | |
except ImportError: | ||
from distutils.core import setup | ||
else: | ||
setup_args['zip_safe'] = False | ||
setup_args['install_requires'] = [ | ||
'Twisted', 'qt4reactor', 'psutil', 'adblockparser' | ||
] | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
# -*- coding: utf-8 -*- | ||
from __future__ import absolute_import | ||
|
||
from .utils import get_duration, format_datetime |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,148 @@ | ||
# -*- coding: utf-8 -*- | ||
from __future__ import absolute_import | ||
from collections import namedtuple | ||
from datetime import datetime | ||
|
||
import splash | ||
from PyQt4.QtCore import PYQT_VERSION_STR, QT_VERSION_STR | ||
from PyQt4.QtWebKit import qWebKitVersion | ||
|
||
from .utils import get_duration, format_datetime, without_private | ||
|
||
|
||
# A single record in ``HarLog.events``: ``type`` is one of the HAR_*
# constants below, ``data`` is the payload stored for that event.
HarEvent = namedtuple('HarEvent', 'type data')

HAR_ENTRY = 'entry'                  # data: mutable HAR entry dict
HAR_TIMING = 'timing'                # data: {"name": ..., "time": datetime}
HAR_URL_CHANGED = 'urlChanged'       # data: the new URL as text
HAR_TITLE_CHANGED = 'titleChanged'   # data: the new title as text


class HarLog(object):
    """
    Helper class for building HAR data.

    Network entries and page-level events (URL change, title change,
    named timings) are appended to ``events`` in the order they are
    reported. ``todict`` then splits them into HAR pages and returns
    the resulting log structure.
    """

    def __init__(self):
        self.created_at = datetime.utcnow()
        self.network_entries_map = {}  # key => network entry map
        self.events = []  # all entries in order, including the events
        self.pages = None  # filled by _fill_pages()

    def get_mutable_entry(self, req_id, create=False):
        """
        Return a dict with HAR entry data. The dict is not a copy;
        caller can modify the result and the changes will be kept.

        With ``create=True`` a new entry ``{"_idx": req_id}`` is
        registered under ``req_id`` and recorded as a HAR_ENTRY event;
        ``req_id`` must not be registered yet. With ``create=False``
        the entry must already exist, otherwise ``KeyError`` is raised.
        """
        if create:
            assert req_id not in self.network_entries_map
            entry = {"_idx": req_id}
            self.network_entries_map[req_id] = entry
            self.events.append(HarEvent(HAR_ENTRY, entry))
        return self.network_entries_map[req_id]

    def store_url(self, url):
        """ Call this method when URL is changed. """
        self.events.append(HarEvent(HAR_URL_CHANGED, unicode(url)))

    def store_title(self, title):
        """ Call this method when page title is changed. """
        self.events.append(HarEvent(HAR_TITLE_CHANGED, unicode(title)))

    def store_timing(self, name):
        """
        Call this method when an event you want to store timing for happened.

        The current UTC time is recorded together with ``name``.
        """
        self.events.append(
            HarEvent(HAR_TIMING, {"name": name, "time": datetime.utcnow()})
        )

    def todict(self):
        """ Return HAR log as a Python dict. """
        # Pages must be computed first: _fill_pages() also writes the
        # "pageref" of every entry returned by _get_har_entries().
        self._fill_pages()

        return {
            "log": {
                "version": "1.2",
                "creator": {
                    "name": "Splash",
                    "version": splash.__version__,
                },
                "browser": self._get_browser(),
                "entries": self._get_har_entries(),
                "pages": self.pages,
            }
        }

    def _get_browser(self):
        """ Return the HAR "browser" record with WebKit/PyQt/Qt versions. """
        return {
            "name": "QWebKit",
            "version": unicode(qWebKitVersion()),
            "comment": "PyQt %s, Qt %s" % (PYQT_VERSION_STR, QT_VERSION_STR),
        }

    def _empty_page(self, page_id, started_dt):
        """
        Return a new HAR page dict with a placeholder title and
        unset (-1) page timings.

        ``started_dt`` may be an already formatted string; any other
        value is passed through ``format_datetime``.
        """
        if not isinstance(started_dt, basestring):
            started_dt = format_datetime(started_dt)

        return {
            "id": str(page_id),
            "title": "[no title]",
            "startedDateTime": started_dt,
            "pageTimings": {
                "onContentLoad": -1,
                "onLoad": -1,
            }
        }

    def _fill_pages(self):
        """
        Rebuild ``self.pages`` from the recorded events and assign a
        "pageref" to every network entry.

        The first URL change belongs to the initial page; each later
        URL change starts a new page.
        """
        page_id = 1
        started_dt = self.created_at
        current_page = self._empty_page(page_id, started_dt)
        first_page = True

        self.pages = [current_page]

        for idx, ev in enumerate(self.events):
            if ev.type == HAR_TIMING:
                name = ev.data["name"]
                time = get_duration(started_dt, ev.data["time"])
                current_page["pageTimings"][name] = time

            elif ev.type == HAR_TITLE_CHANGED:
                current_page["title"] = ev.data

            elif ev.type == HAR_ENTRY:
                ev.data["pageref"] = str(page_id)

            elif ev.type == HAR_URL_CHANGED:
                if first_page:
                    first_page = False
                    continue

                # We need to find a network entry which caused URL
                # to change - it belongs to this new page.
                cause_ev = self._prev_entry(ev.data, idx)

                # Start a new page.
                page_id += 1
                if cause_ev is not None:
                    started_dt = cause_ev.data['_tmp']['start_time']
                # Otherwise no recorded request matches the new URL;
                # URL-change events carry no timestamp of their own, so
                # the previous start time is kept as the best estimate.
                current_page = self._empty_page(page_id, started_dt)
                self.pages.append(current_page)
                if cause_ev is not None:
                    cause_ev.data["pageref"] = str(page_id)

    def _prev_entry(self, url, last_idx):
        """
        Return the most recent HAR_ENTRY event located before index
        ``last_idx`` whose request URL equals ``url``, or None if
        there is no such event.
        """
        for ev in reversed(self.events[:last_idx]):
            if ev.type != HAR_ENTRY:
                continue
            # Entries are created as {"_idx": ...} and filled in later,
            # so "request" may still be missing.
            request = ev.data.get("request") or {}
            if "url" in request and request["url"] == url:
                return ev
        return None

    def _get_har_entries(self):
        """
        Return all network entries in recorded order, each passed
        through ``without_private``.
        """
        return [
            without_private(e.data)
            for e in self.events
            if e.type == HAR_ENTRY
        ]
Oops, something went wrong.