Skip to content

Commit

Permalink
Merge pull request #112 from scrapinghub/HAR-multi-pages
Browse files Browse the repository at this point in the history
[MRG] HAR support take 2
  • Loading branch information
pablohoffman committed Oct 3, 2014
2 parents e604309 + bac876b commit 3135d77
Show file tree
Hide file tree
Showing 27 changed files with 1,637 additions and 78 deletions.
4 changes: 4 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
[submodule "splash/vendor/harviewer"]
path = splash/vendor/harviewer
url = https://github.com/janodvarko/harviewer.git

4 changes: 2 additions & 2 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@ virtualenv:

before_install:
- sudo apt-get install python-twisted python-qt4
- pip install qt4reactor psutil requests Pillow
- "if [[ $SYSTEM_TWISTED == 'false' ]]; then pip install -UI twisted; fi"
- pip install qt4reactor psutil requests Pillow jsonschema strict-rfc3339
- "if [[ $SYSTEM_TWISTED == 'false' ]]; then pip install -UI twisted service_identity; fi"

install:
- python setup.py install
Expand Down
1 change: 1 addition & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@ include requirements.txt

recursive-include docs *.rst
recursive-include splash/tests *.txt *.js *.ini
recursive-include splash/vendor/harviewer/webapp *.js *.html *.css *.gif *.png *.swf *.html
59 changes: 59 additions & 0 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,8 @@ images : integer : optional
Note that cached images may be displayed even if this parameter is 0.
You can also use `Request Filters`_ to strip unwanted contents based on URL.

Examples
~~~~~~~~

Curl example::

Expand Down Expand Up @@ -154,6 +156,9 @@ height : integer : optional
Crop the rendered image to the given height (in pixels). Often used in
conjunction with the width argument to generate fixed-size thumbnails.

Examples
~~~~~~~~

Curl examples::

# render with timeout
Expand All @@ -163,6 +168,26 @@ Curl examples::
curl 'http://localhost:8050/render.png?url=http://domain.com/page-with-javascript.html&width=320&height=240'


render.har
----------

Return information about Splash interaction with a website in HAR_ format.
It includes information about requests made, responses received, timings,
headers, cookies, etc.

You can use the online `HAR viewer`_ to visualize the information returned from
this endpoint; the result looks very similar to the "Network" tab in the Firefox
and Chrome developer tools.

Currently this endpoint doesn't expose raw request and response contents;
only meta-information like headers and timings is available.

Arguments for this endpoint are the same as for `render.html`_.

.. _HAR: http://www.softwareishard.com/blog/har-12-spec/
.. _HAR viewer: http://www.softwareishard.com/har/viewer/


render.json
-----------

Expand Down Expand Up @@ -206,6 +231,30 @@ console : integer : optional
Whether to include the executed javascript console messages in output.
Possible values are ``1`` (include) and ``0`` (exclude). Default is 0.

.. _arg-history:

history : integer : optional
Whether to include the history of requests/responses for the webpage's main
frame. Possible values are ``1`` (include) and ``0`` (exclude).
Default is 0.

Use it to get HTTP status codes, cookies and headers.
Only information about "main" requests/responses is returned
(i.e. information about related resources like images and AJAX queries
is not returned). To get information about all requests and responses,
use the :ref:`'har' <arg-har>` argument.

.. _arg-har:

har : integer : optional
Whether to include HAR_ in output. Possible values are
``1`` (include) and ``0`` (exclude). Default is 0.
If this option is on, the result will contain, under the 'har' key,
the same data that `render.har`_ provides.

Examples
~~~~~~~~

By default, URL, requested URL, page title and frame geometry are returned::

{
Expand Down Expand Up @@ -621,6 +670,16 @@ X-Splash-script : string
X-Splash-console : string
Same as :ref:`'console' <arg-console>` argument for `render.json`_.

X-Splash-history : string
Same as :ref:`'history' <arg-history>` argument for `render.json`_.

X-Splash-har : string
Same as :ref:`'har' <arg-har>` argument for `render.json`_.

.. note::

Proxying of HTTPS requests is not supported.

Curl examples::

# Display json stats
Expand Down
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,5 @@ adblockparser >= 0.2
pyOpenSSL
requests >= 1.0
Pillow
jsonschema
strict-rfc3339
29 changes: 27 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,13 @@
import os
import re


def _path(*args):
    """Join ``args`` onto the directory that contains this file."""
    here = os.path.dirname(__file__)
    return os.path.join(here, *args)


def get_version():
    """
    Return the package version declared in ``splash/__init__.py``.

    The version is extracted with a regular expression from the
    ``__version__ = '...'`` assignment in that file.

    Raises RuntimeError if no such assignment is found.
    """
    # Local import: io.open decodes the file contents itself, so the same
    # code works on Python 2 and Python 3 (str has no .decode() on 3).
    import io

    filename = _path('splash', '__init__.py')
    with io.open(filename, 'r', encoding='utf8') as fp:
        contents = fp.read()

    match = re.search(r"__version__ = ['\"](.+)['\"]", contents)
    if match is None:
        # Fail with a clear message instead of AttributeError on None.
        raise RuntimeError("Unable to find __version__ in %s" % filename)
    return match.group(1)
Expand All @@ -12,7 +17,6 @@ def get_version():
setup_args = {
'name': 'splash',
'version': get_version(),
'packages': ['splash'],
'url': 'https://github.com/scrapinghub/splash',
'description': 'A javascript rendered with a HTTP API',
'long_description': open('README.rst').read(),
Expand All @@ -22,6 +26,26 @@ def get_version():
'maintainer_email': '[email protected]',
'license': 'BSD',
'scripts': ['bin/splash'],
'packages': ['splash', 'splash.har'],
'package_data': {'splash': [
'vendor/harviewer/webapp/css/*.css',
'vendor/harviewer/webapp/css/images/*.*',
'vendor/harviewer/webapp/css/images/menu/*.*',
'vendor/harviewer/webapp/scripts/*.*',
'vendor/harviewer/webapp/scripts/core/*.*',
'vendor/harviewer/webapp/scripts/domplate/*.*',
'vendor/harviewer/webapp/scripts/downloadify/js/*.*',
'vendor/harviewer/webapp/scripts/downloadify/src/*.*',
'vendor/harviewer/webapp/scripts/downloadify/media/*.*',
'vendor/harviewer/webapp/scripts/excanvas/*.*',
'vendor/harviewer/webapp/scripts/jquery-plugins/*.*',
'vendor/harviewer/webapp/scripts/json-query/*.*',
'vendor/harviewer/webapp/scripts/nls/*.*',
'vendor/harviewer/webapp/scripts/preview/*.*',
'vendor/harviewer/webapp/scripts/syntax-highlighter/*.js',
'vendor/harviewer/webapp/scripts/tabs/*.*',
'vendor/harviewer/webapp/har.js',
]},
'classifiers': [
'Programming Language :: Python',
'Programming Language :: Python :: 2',
Expand All @@ -41,6 +65,7 @@ def get_version():
except ImportError:
from distutils.core import setup
else:
setup_args['zip_safe'] = False
setup_args['install_requires'] = [
'Twisted', 'qt4reactor', 'psutil', 'adblockparser'
]
Expand Down
2 changes: 2 additions & 0 deletions splash/defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@
DO_PNG = 0
SHOW_SCRIPT = 0
SHOW_CONSOLE = 0
SHOW_HISTORY = 0
SHOW_HAR = 0

# servers
SPLASH_PORT = 8050
Expand Down
4 changes: 4 additions & 0 deletions splash/har/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import

from .utils import get_duration, format_datetime
148 changes: 148 additions & 0 deletions splash/har/log.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from collections import namedtuple
from datetime import datetime

import splash
from PyQt4.QtCore import PYQT_VERSION_STR, QT_VERSION_STR
from PyQt4.QtWebKit import qWebKitVersion

from .utils import get_duration, format_datetime, without_private


# A single recorded event: ``type`` is one of the HAR_* constants below,
# ``data`` is the payload stored for that event type.
HarEvent = namedtuple('HarEvent', 'type data')

HAR_ENTRY = 'entry'                  # data: mutable HAR entry dict
HAR_TIMING = 'timing'                # data: {"name": ..., "time": datetime}
HAR_URL_CHANGED = 'urlChanged'       # data: the new URL
HAR_TITLE_CHANGED = 'titleChanged'   # data: the new page title


class HarLog(object):
    """
    Helper class for building HAR data.

    Network entries and page-level events (URL changes, title changes,
    timings) are appended to ``self.events`` in the order they are stored;
    :meth:`todict` groups them into HAR pages and returns the final log.
    """

    def __init__(self):
        self.created_at = datetime.utcnow()
        self.network_entries_map = {}  # key => network entry map
        self.events = []  # all entries in order, including the events
        self.pages = None  # list of HAR pages; built by _fill_pages()

    def get_mutable_entry(self, req_id, create=False):
        """
        Return a dict with HAR entry data. The dict is not a copy;
        caller can modify the result and the changes will be kept.

        If ``create`` is true, a new entry containing only ``"_idx"`` is
        registered under ``req_id`` (which must not be in use yet) and
        recorded as a HAR_ENTRY event. Otherwise the existing entry is
        looked up; KeyError is raised if ``req_id`` is unknown.
        """
        if create:
            assert req_id not in self.network_entries_map
            entry = {"_idx": req_id}
            self.network_entries_map[req_id] = entry
            self.events.append(HarEvent(HAR_ENTRY, entry))
        return self.network_entries_map[req_id]

    def store_url(self, url):
        """ Call this method when URL is changed. """
        self.events.append(HarEvent(HAR_URL_CHANGED, unicode(url)))

    def store_title(self, title):
        """ Call this method when page title is changed. """
        self.events.append(HarEvent(HAR_TITLE_CHANGED, unicode(title)))

    def store_timing(self, name):
        """
        Call this method when an event you want to store timing for happened.

        ``name`` becomes the key under the current page's "pageTimings";
        the current UTC time is recorded with the event.
        """
        self.events.append(
            HarEvent(HAR_TIMING, {"name": name, "time": datetime.utcnow()})
        )

    def todict(self):
        """ Return HAR log as a Python dict. """
        # Pages (and the "pageref" of every entry) are recomputed from
        # the recorded events each time.
        self._fill_pages()

        return {
            "log": {
                "version": "1.2",
                "creator": {
                    "name": "Splash",
                    "version": splash.__version__,
                },
                "browser": self._get_browser(),
                "entries": self._get_har_entries(),
                "pages": self.pages,
            }
        }

    def _get_browser(self):
        """ Return the HAR "browser" section. """
        return {
            "name": "QWebKit",
            "version": unicode(qWebKitVersion()),
            "comment": "PyQt %s, Qt %s" % (PYQT_VERSION_STR, QT_VERSION_STR),
        }

    def _empty_page(self, page_id, started_dt):
        """
        Return a new HAR page dict with default title and timings.

        ``started_dt`` may be an already formatted string; any other value
        is passed through ``format_datetime``.
        """
        if not isinstance(started_dt, basestring):
            started_dt = format_datetime(started_dt)

        return {
            "id": str(page_id),
            "title": "[no title]",
            "startedDateTime": started_dt,
            "pageTimings": {
                "onContentLoad": -1,
                "onLoad": -1,
            }
        }

    def _fill_pages(self):
        """
        Rebuild ``self.pages`` from the recorded events and assign a
        "pageref" to every network entry.

        The first URL change belongs to the initial page; every later URL
        change that can be matched to a network entry starts a new page.
        """
        page_id = 1
        started_dt = self.created_at
        current_page = self._empty_page(page_id, started_dt)
        first_page = True

        self.pages = [current_page]

        for idx, ev in enumerate(self.events):
            if ev.type == HAR_TIMING:
                name = ev.data["name"]
                time = get_duration(started_dt, ev.data["time"])
                current_page["pageTimings"][name] = time

            elif ev.type == HAR_TITLE_CHANGED:
                current_page["title"] = ev.data

            elif ev.type == HAR_ENTRY:
                ev.data["pageref"] = str(page_id)

            elif ev.type == HAR_URL_CHANGED:
                if first_page:
                    # The initial page already exists, and its entries
                    # got pageref "1" in the HAR_ENTRY branch above.
                    first_page = False
                    continue

                # We need to find a network entry which caused URL
                # to change - it belongs to this new page.
                cause_ev = self._prev_entry(ev.data, idx)
                if cause_ev is None:
                    # No recorded request matches the new URL, so there is
                    # no start time for a new page; keep the current page
                    # instead of failing on a missing entry.
                    continue

                # Start a new page.
                page_id += 1
                started_dt = cause_ev.data['_tmp']['start_time']
                current_page = self._empty_page(page_id, started_dt)
                self.pages.append(current_page)
                cause_ev.data["pageref"] = str(page_id)

    def _prev_entry(self, url, last_idx):
        """
        Return the most recent HAR_ENTRY event stored before position
        ``last_idx`` whose request URL equals ``url``, or None if there
        is no such event.
        """
        for ev in reversed(self.events[:last_idx]):
            if ev.type != HAR_ENTRY:
                continue
            # Entries are created holding only "_idx"; skip the ones that
            # have no request data yet instead of raising KeyError.
            request = ev.data.get("request")
            if request and request.get("url") == url:
                return ev
        return None

    def _get_har_entries(self):
        """ Return all network entries, in order, via ``without_private``. """
        return [
            without_private(e.data)
            for e in self.events
            if e.type == HAR_ENTRY
        ]
Loading

0 comments on commit 3135d77

Please sign in to comment.