diff --git a/docs/api.rst b/docs/api.rst index 263f24d56..4b0e5da4d 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -355,14 +355,26 @@ You can use online `HAR viewer`_ to visualize information returned from this endpoint; it will be very similar to "Network" tabs in Firefox and Chrome developer tools. -Currently this endpoint doesn't expose raw request contents; -only meta-information like headers and timings is available. -Response contents is included when -:ref:`'response_body' <arg-response-body-0>` option is set to 1. +Request and response contents are included when +:ref:`'request_body' <arg-request-body-0>` and +:ref:`'response_body' <arg-response-body-0>` options are set to 1, +respectively. + +Due to the HAR_ format specification lacking a method of encoding binary +request data, a non-standard ``encoding`` field is included in ``postData``, +which, similarly to the field of the same name in HAR responses, has the value +``base64`` when the request body has been encoded as such. Arguments for this endpoint are the same as for `render.html`_, plus the following: +.. _arg-request-body-0: + +request_body : int : optional + Possible values are ``1`` and ``0``. When ``request_body=1``, + request content is included in HAR records. Default is ``request_body=0``. + + .. _arg-response-body-0: response_body : int : optional @@ -446,8 +458,17 @@ har : integer : optional If this option is ON the result will contain the same data as `render.har`_ provides under 'har' key. - By default, response content is not included. To enable it use - :ref:`'response_body' <arg-response-body>` option. + By default, request and response contents are not included. To enable each, + use :ref:`'request_body' <arg-request-body>` and + :ref:`'response_body' <arg-response-body>` options respectively. + +.. _arg-request-body: + +request_body : int : optional + Possible values are ``1`` and ``0``. When ``request_body=1``, + request content is included in HAR records. Default is + ``request_body=0``. This option has no effect when + both :ref:`'har' <arg-har>` and :ref:`'history' <arg-history>` are 0. .. 
_arg-response-body: diff --git a/docs/scripting-ref.rst b/docs/scripting-ref.rst index 224707a2c..9c0c28b0c 100644 --- a/docs/scripting-ref.rst +++ b/docs/scripting-ref.rst @@ -149,6 +149,24 @@ Enable or disable browser plugins (e.g. Flash). Plugins are disabled by default. +.. _splash-request-body-enabled: + +splash.request_body_enabled +---------------------------- + +Enable or disable storage of request content. + +**Signature:** ``splash.request_body_enabled = true/false`` + +By default Splash doesn't keep bodies of each request in memory. It means that +request content is not available in :ref:`splash-request-info` and in HAR_ +exports. To make request content available to a Lua script set +``splash.request_body_enabled = true``. + +Note that request body in :ref:`splash-request-info` is not available in the +callback :ref:`splash-on-response-headers` or in the request of the response +returned by :ref:`splash-http-get` and :ref:`splash-http-post`. + .. _splash-response-body-enabled: splash.response_body_enabled @@ -1453,12 +1471,14 @@ all existing logs and start recording from scratch: return {har1=har1, har2=har2} end -By default, response content is not returned in HAR data. To enable it, use -:ref:`splash-response-body-enabled` option or +By default, request and response contents are not included in HAR data. To +enable request contents, use :ref:`splash-request-body-enabled` option. To +enable response contents, use :ref:`splash-response-body-enabled` option or :ref:`splash-request-enable-response-body` method. See also: :ref:`splash-har-reset`, :ref:`splash-on-response`, -:ref:`splash-response-body-enabled`, :ref:`splash-request-enable-response-body`. +:ref:`splash-request-body-enabled`, :ref:`splash-response-body-enabled`, +:ref:`splash-request-enable-response-body`. .. 
_HAR: http://www.softwareishard.com/blog/har-12-spec/ diff --git a/splash/browser_tab.py b/splash/browser_tab.py index 9842f495f..15e3cf37f 100644 --- a/splash/browser_tab.py +++ b/splash/browser_tab.py @@ -212,6 +212,7 @@ def _set_default_webpage_options(self, web_page): self.set_js_enabled(True) self.set_plugins_enabled(defaults.PLUGINS_ENABLED) + self.set_request_body_enabled(defaults.REQUEST_BODY_ENABLED) self.set_response_body_enabled(defaults.RESPONSE_BODY_ENABLED) self.set_indexeddb_enabled(defaults.INDEXEDDB_ENABLED) self.set_webgl_enabled(defaults.WEBGL_ENABLED) @@ -254,6 +255,12 @@ def set_custom_headers(self, headers): """ self.web_page.custom_headers = headers + def get_request_body_enabled(self): + return self.web_page.request_body_enabled + + def set_request_body_enabled(self, val): + self.web_page.request_body_enabled = val + def get_response_body_enabled(self): return self.web_page.response_body_enabled diff --git a/splash/defaults.py b/splash/defaults.py index 670c9815f..ddc2e1cf4 100644 --- a/splash/defaults.py +++ b/splash/defaults.py @@ -85,6 +85,9 @@ # plugins (e.g. flash) PLUGINS_ENABLED = False +# request content +REQUEST_BODY_ENABLED = False + # response content RESPONSE_BODY_ENABLED = False diff --git a/splash/har/qt.py b/splash/har/qt.py index 9dae3cac3..1edc23d53 100644 --- a/splash/har/qt.py +++ b/splash/har/qt.py @@ -134,13 +134,39 @@ def reply2har(reply, content=None): res["content"]["size"] = len(content) res["content"]["text"] = base64.b64encode(content).decode('latin1') res["content"]["encoding"] = 'base64' - + return res -def request2har(request, operation, outgoing_data=None): +def _har_postdata(body, content_type): + """ + + Build the postData value for HAR, from a binary body and a content type. + + """ + + postdata = {"mimeType": content_type or "?"} + + if content_type == "application/x-www-form-urlencoded": + # application/x-www-form-urlencoded is valid ASCII, see + # . 
+ try: + postdata["text"] = body.decode('ascii') + except UnicodeDecodeError: + pass + + # This is non-standard. The HAR format does not specify how to handle + # binary request data. + if "text" not in postdata: + postdata["encoding"] = "base64" + postdata["text"] = base64.b64encode(body).decode('ascii') + + return postdata + + +def request2har(request, operation, content=None): """ Serialize QNetworkRequest to HAR. """ - return { + har = { "method": OPERATION_NAMES.get(operation, '?'), "url": str(request.url().toString()), "httpVersion": "HTTP/1.1", @@ -148,5 +174,14 @@ def request2har(request, operation, outgoing_data=None): "queryString": querystring2har(request.url()), "headers": headers2har(request), "headersSize": headers_size(request), - "bodySize": outgoing_data.size() if outgoing_data is not None else -1, + "bodySize": -1 } + if content is not None: + har["bodySize"] = len(content) + content_type = request.header(QNetworkRequest.ContentTypeHeader) + har["postData"] = _har_postdata(content, content_type) + else: + content_length = request.header(QNetworkRequest.ContentLengthHeader) + if content_length is not None: + har["bodySize"] = content_length + return har diff --git a/splash/har_builder.py b/splash/har_builder.py index c9a2e469b..ee1b34645 100644 --- a/splash/har_builder.py +++ b/splash/har_builder.py @@ -52,7 +52,7 @@ def get_entry(self, req_id): entry = self.log.get_mutable_entry(req_id) return entry - def _initial_entry_data(self, start_time, operation, request, outgoingData): + def _initial_entry_data(self, start_time, operation, request, content): """ Return initial values for a new HAR entry. 
""" @@ -62,14 +62,13 @@ def _initial_entry_data(self, start_time, operation, request, outgoingData): 'start_time': start_time, 'request_start_sending_time': start_time, 'request_sent_time': start_time, - 'response_start_time': start_time, - # 'outgoingData': outgoingData, + 'response_start_time': start_time }, '_splash_processing_state': self.REQUEST_CREATED, # standard fields "startedDateTime": format_datetime(start_time), - "request": request2har(request, operation, outgoingData), + "request": request2har(request, operation, content), "response": { "bodySize": -1, }, @@ -98,7 +97,8 @@ def store_url(self, url): def store_timing(self, name): self.log.store_timing(name) - def store_new_request(self, req_id, start_time, operation, request, outgoingData): + def store_new_request(self, req_id, start_time, operation, request, + content): """ Store information about a new QNetworkRequest. """ @@ -107,7 +107,7 @@ def store_new_request(self, req_id, start_time, operation, request, outgoingData start_time=start_time, operation=operation, request=request, - outgoingData=outgoingData + content=content )) def store_new_reply(self, req_id, reply): diff --git a/splash/kernel/inspections/splash-auto.json b/splash/kernel/inspections/splash-auto.json index 6c3cc0227..a3438210b 100644 --- a/splash/kernel/inspections/splash-auto.json +++ b/splash/kernel/inspections/splash-auto.json @@ -65,6 +65,17 @@ "details": "Plugins are disabled by default.", "params": null }, + "splash.request_body_enabled": { + "name": null, + "header": "splash.request_body_enabled", + "content": "Enable or disable storage of request content.\n\n**Signature:** ``splash.request_body_enabled = true/false``\n\nBy default Splash doesn't keep bodies of each request in memory. It means that\nrequest content is not available in :ref:`splash-request-info` and in HAR_\nexports. 
To make request content available to a Lua script set\n``splash.request_body_enabled = true``.\n\nNote that request body in :ref:`splash-request-info` is not available in the\ncallback :ref:`splash-on-response-headers` or in the request of the response\nreturned by :ref:`splash-http-get` and :ref:`splash-http-post`.", + "short": "Enable or disable storage of request content.", + "signature": "splash.request_body_enabled = true/false", + "returns": null, + "async": null, + "details": "By default Splash doesn't keep bodies of each request in memory. It means that\nrequest content is not available in :ref:`splash-request-info` and in HAR_\nexports. To make request content available to a Lua script set\n``splash.request_body_enabled = true``.\n\nNote that request body in :ref:`splash-request-info` is not available in the\ncallback :ref:`splash-on-response-headers` or in the request of the response\nreturned by :ref:`splash-http-get` and :ref:`splash-http-post`.", + "params": null + }, "splash.response_body_enabled": { "name": null, "header": "splash.response_body_enabled", @@ -299,12 +310,12 @@ "splash:har": { "name": "har", "header": "splash:har", - "content": "**Signature:** ``har = splash:har{reset=false}``\n\n**Parameters:**\n\n* reset - optional; when ``true``, reset HAR records after taking a snapshot.\n\n**Returns:** information about pages loaded, events happened,\nnetwork requests sent and responses received in HAR_ format.\n\n**Async:** no.\n\nUse :ref:`splash-har` to get information about network requests and\nother Splash activity.\n\nIf your script returns the result of ``splash:har()`` in a top-level\n``\"har\"`` key then Splash UI will give you a nice diagram with network\ninformation (similar to \"Network\" tabs in Firefox or Chrome developer tools):\n\n.. code-block:: lua\n\n function main(splash)\n assert(splash:go(splash.args.url))\n return {har=splash:har()}\n end\n\nBy default, when several requests are made (e.g. 
:ref:`splash-go` is called\nmultiple times), HAR data is accumulated and combined into a single object\n(logs are still grouped by page).\n\nIf you want only updated information use ``reset`` parameter: it drops\nall existing logs and start recording from scratch:\n\n.. code-block:: lua\n\n function main(splash, args)\n assert(splash:go(args.url1))\n local har1 = splash:har{reset=true}\n assert(splash:go(args.url2))\n local har2 = splash:har()\n return {har1=har1, har2=har2}\n end\n\nBy default, response content is not returned in HAR data. To enable it, use\n:ref:`splash-response-body-enabled` option or\n:ref:`splash-request-enable-response-body` method.\n\nSee also: :ref:`splash-har-reset`, :ref:`splash-on-response`,\n:ref:`splash-response-body-enabled`, :ref:`splash-request-enable-response-body`.\n\n.. _HAR: http://www.softwareishard.com/blog/har-12-spec/", + "content": "**Signature:** ``har = splash:har{reset=false}``\n\n**Parameters:**\n\n* reset - optional; when ``true``, reset HAR records after taking a snapshot.\n\n**Returns:** information about pages loaded, events happened,\nnetwork requests sent and responses received in HAR_ format.\n\n**Async:** no.\n\nUse :ref:`splash-har` to get information about network requests and\nother Splash activity.\n\nIf your script returns the result of ``splash:har()`` in a top-level\n``\"har\"`` key then Splash UI will give you a nice diagram with network\ninformation (similar to \"Network\" tabs in Firefox or Chrome developer tools):\n\n.. code-block:: lua\n\n function main(splash)\n assert(splash:go(splash.args.url))\n return {har=splash:har()}\n end\n\nBy default, when several requests are made (e.g. :ref:`splash-go` is called\nmultiple times), HAR data is accumulated and combined into a single object\n(logs are still grouped by page).\n\nIf you want only updated information use ``reset`` parameter: it drops\nall existing logs and start recording from scratch:\n\n.. 
code-block:: lua\n\n function main(splash, args)\n assert(splash:go(args.url1))\n local har1 = splash:har{reset=true}\n assert(splash:go(args.url2))\n local har2 = splash:har()\n return {har1=har1, har2=har2}\n end\n\nBy default, request and response contents are not included in HAR data. To\nenable request contents, use :ref:`splash-request-body-enabled` option. To\nenable response contents, use :ref:`splash-response-body-enabled` option or\n:ref:`splash-request-enable-response-body` method.\n\nSee also: :ref:`splash-har-reset`, :ref:`splash-on-response`,\n:ref:`splash-request-body-enabled`, :ref:`splash-response-body-enabled`,\n:ref:`splash-request-enable-response-body`.\n\n.. _HAR: http://www.softwareishard.com/blog/har-12-spec/", "short": "", "signature": "har = splash:har{reset=false}", "returns": "information about pages loaded, events happened,\nnetwork requests sent and responses received in HAR_ format.", "async": "no.", - "details": "Use :ref:`splash-har` to get information about network requests and\nother Splash activity.\n\nIf your script returns the result of ``splash:har()`` in a top-level\n``\"har\"`` key then Splash UI will give you a nice diagram with network\ninformation (similar to \"Network\" tabs in Firefox or Chrome developer tools):\n\n.. code-block:: lua\n\n function main(splash)\n assert(splash:go(splash.args.url))\n return {har=splash:har()}\n end\n\nBy default, when several requests are made (e.g. :ref:`splash-go` is called\nmultiple times), HAR data is accumulated and combined into a single object\n(logs are still grouped by page).\n\nIf you want only updated information use ``reset`` parameter: it drops\nall existing logs and start recording from scratch:\n\n.. code-block:: lua\n\n function main(splash, args)\n assert(splash:go(args.url1))\n local har1 = splash:har{reset=true}\n assert(splash:go(args.url2))\n local har2 = splash:har()\n return {har1=har1, har2=har2}\n end\n\nBy default, response content is not returned in HAR data. 
To enable it, use\n:ref:`splash-response-body-enabled` option or\n:ref:`splash-request-enable-response-body` method.\n\nSee also: :ref:`splash-har-reset`, :ref:`splash-on-response`,\n:ref:`splash-response-body-enabled`, :ref:`splash-request-enable-response-body`.\n\n.. _HAR: http://www.softwareishard.com/blog/har-12-spec/", + "details": "Use :ref:`splash-har` to get information about network requests and\nother Splash activity.\n\nIf your script returns the result of ``splash:har()`` in a top-level\n``\"har\"`` key then Splash UI will give you a nice diagram with network\ninformation (similar to \"Network\" tabs in Firefox or Chrome developer tools):\n\n.. code-block:: lua\n\n function main(splash)\n assert(splash:go(splash.args.url))\n return {har=splash:har()}\n end\n\nBy default, when several requests are made (e.g. :ref:`splash-go` is called\nmultiple times), HAR data is accumulated and combined into a single object\n(logs are still grouped by page).\n\nIf you want only updated information use ``reset`` parameter: it drops\nall existing logs and start recording from scratch:\n\n.. code-block:: lua\n\n function main(splash, args)\n assert(splash:go(args.url1))\n local har1 = splash:har{reset=true}\n assert(splash:go(args.url2))\n local har2 = splash:har()\n return {har1=har1, har2=har2}\n end\n\nBy default, request and response contents are not included in HAR data. To\nenable request contents, use :ref:`splash-request-body-enabled` option. To\nenable response contents, use :ref:`splash-response-body-enabled` option or\n:ref:`splash-request-enable-response-body` method.\n\nSee also: :ref:`splash-har-reset`, :ref:`splash-on-response`,\n:ref:`splash-request-body-enabled`, :ref:`splash-response-body-enabled`,\n:ref:`splash-request-enable-response-body`.\n\n.. _HAR: http://www.softwareishard.com/blog/har-12-spec/", "params": "* reset - optional; when ``true``, reset HAR records after taking a snapshot." 
}, "splash:har_reset": { diff --git a/splash/network_manager.py b/splash/network_manager.py index 7c7929bd7..29108a505 100644 --- a/splash/network_manager.py +++ b/splash/network_manager.py @@ -5,7 +5,7 @@ from datetime import datetime import traceback -from PyQt5.QtCore import QByteArray, QTimer +from PyQt5.QtCore import QByteArray, QTimer from PyQt5.QtNetwork import ( QNetworkAccessManager, QNetworkProxyQuery, @@ -22,7 +22,7 @@ RequestLoggingMiddleware, AdblockRulesRegistry, ResourceTimeoutMiddleware, - ResponseBodyTrackingMiddleware, + RequestResponseBodyTrackingMiddleware, ) from splash.response_middleware import ContentTypeMiddleware from splash import defaults @@ -52,7 +52,7 @@ def __init__(self, filters_path=None, verbosity=None, allowed_schemes=None, disa self.request_middlewares.append(AllowedDomainsMiddleware(verbosity=verbosity)) self.request_middlewares.append(ResourceTimeoutMiddleware()) - self.request_middlewares.append(ResponseBodyTrackingMiddleware()) + self.request_middlewares.append(RequestResponseBodyTrackingMiddleware()) if filters_path is not None: self.adblock_rules = AdblockRulesRegistry(filters_path, verbosity=verbosity) @@ -84,7 +84,7 @@ class ProxiedQNetworkAccessManager(QNetworkAccessManager): * Provides a way to get the "source" request (that was made to Splash itself). * Tracks information about requests/responses and stores it in HAR format, - including response content. + including request and response content. * Allows to set per-request timeouts. 
""" _REQUEST_ID = QNetworkRequest.User + 1 @@ -123,6 +123,40 @@ def createRequest(self, operation, request, outgoingData=None): self.log(traceback.format_exc(), min_level=1, format_msg=False) return super(ProxiedQNetworkAccessManager, self).createRequest(operation, request, outgoingData) + def _get_request_body(self, request, outgoing_data): + if outgoing_data is None: + return + + size = None + if outgoing_data.isSequential(): + # In a sequential QIODevice, size() returns the value of + # bytesAvailable(), which is only the size of the data in the + # QIODevice buffer and not the total size of the output. Until + # a read attempt, the buffer is empty and size() returns zero. + # + # In requests generated by Qt WebKit, outgoing_data is a + # WebCore::FormDataIODevice object, which is sequential. Its + # getFormDataSize() method cannot be accessed through PyQt5, + # but Qt WebKit puts its value in the Content-Length header + # (see WebCore::QNetworkReplyHandler::getIODevice). + header = request.header(QNetworkRequest.ContentLengthHeader) + if isinstance(header, int): + size = header + else: + self.log(("non-integer QNetworkRequest Content-Length " + "header value %r") % (header,), + min_level=1) + else: + size = outgoing_data.size() + + if size is not None: + body = bytes(outgoing_data.peek(size)) + if len(body) != size: + self.log(("request body of size %s but Content-Length " + "header value %s") % (len(body), size), + min_level=1) + return body + def _createRequest(self, operation, request, outgoingData=None): """ This method is called when a new request is sent; @@ -136,11 +170,17 @@ def _createRequest(self, operation, request, outgoingData=None): self._clear_proxy() request, req_id = self._wrap_request(request) + + if getattr(request, 'track_request_body', False): + content = self._get_request_body(request, outgoingData) + else: + content = None + self._handle_custom_headers(request) self._handle_request_cookies(request) self._run_webpage_callbacks(request, 
'on_request', - request, operation, outgoingData) + request, operation, content) self._handle_custom_proxies(request) self._handle_request_response_tracking(request) @@ -152,7 +192,7 @@ def _createRequest(self, operation, request, outgoingData=None): start_time=start_time, operation=operation, request=request, - outgoingData=outgoingData, + content=content ) reply = super(ProxiedQNetworkAccessManager, self).createRequest( @@ -219,7 +259,7 @@ def _wrap_request(self, request): req.setAttribute(QNetworkRequest.CacheLoadControlAttribute, QNetworkRequest.AlwaysNetwork) req.setAttribute(QNetworkRequest.CacheSaveControlAttribute, False) - for attr in ['timeout', 'track_response_body']: + for attr in ['timeout', 'track_request_body', 'track_response_body']: if hasattr(request, attr): setattr(req, attr, getattr(request, attr)) return req, req_id @@ -313,7 +353,7 @@ def _set_webpage_attribute(self, request, attribute, value): def _on_reply_error(self, error_id): self._response_bodies.pop(self._get_request_id(), None) - + if error_id != QNetworkReply.OperationCanceledError: error_msg = REQUEST_ERRORS.get(error_id, 'unknown error') self.log('Download error %d: %s ({url})' % (error_id, error_msg), diff --git a/splash/qtrender.py b/splash/qtrender.py index b4ccd89ff..29fbab959 100644 --- a/splash/qtrender.py +++ b/splash/qtrender.py @@ -75,8 +75,8 @@ class DefaultRenderScript(RenderScript): def start(self, url, baseurl=None, wait=None, viewport=None, js_source=None, js_profile=None, images=None, console=False, headers=None, http_method='GET', body=None, - render_all=False, resource_timeout=None, response_body=False, - html5_media=False): + render_all=False, resource_timeout=None, request_body=False, + response_body=False, html5_media=False): self.url = url self.wait_time = defaults.WAIT_TIME if wait is None else wait @@ -95,6 +95,7 @@ def start(self, url, baseurl=None, wait=None, viewport=None, if self.viewport != 'full': self.tab.set_viewport(self.viewport) + 
self.tab.set_request_body_enabled(request_body) self.tab.set_response_body_enabled(response_body) self.tab.set_html5_media_enabled(html5_media) @@ -220,6 +221,7 @@ def start(self, **kwargs): self.include = {inc: kwargs.pop(inc) for inc in include_options} self.include['console'] = kwargs.get('console') if not self.include['har'] and not self.include['history']: + kwargs['request_body'] = False kwargs['response_body'] = False super(JsonRender, self).start(**kwargs) diff --git a/splash/qtrender_lua.py b/splash/qtrender_lua.py index a542ad81c..f2c425fba 100644 --- a/splash/qtrender_lua.py +++ b/splash/qtrender_lua.py @@ -530,6 +530,16 @@ def get_private_mode_enabled(self): def set_private_mode_enabled(self, value): self.tab.set_private_mode_enabled(bool(value)) + @lua_property('request_body_enabled') + @command() + def get_request_body_enabled(self): + return self.tab.get_request_body_enabled() + + @get_request_body_enabled.lua_setter + @command() + def set_request_body_enabled(self, value): + self.tab.set_request_body_enabled(bool(value)) + @lua_property('response_body_enabled') @command() def get_response_body_enabled(self): @@ -1240,12 +1250,12 @@ def _on_request(self, callback): """ Register a Lua callback to be called when a resource is requested. 
""" - def _callback(request, operation, outgoing_data): + def _callback(request, operation, content): if self.destroyed: return exceptions = StoredExceptions() # FIXME: exceptions are discarded req = _ExposedBoundRequest(self.lua, exceptions, request, operation, - outgoing_data) + content) with req.allowed(): callback(req) @@ -2058,11 +2068,11 @@ class _ExposedBoundRequest(BaseExposedObject): """ QNetworkRequest wrapper for Lua """ _attribute_whitelist = ['url', 'method', 'headers', 'info'] - def __init__(self, lua, exceptions, request, operation, outgoing_data): + def __init__(self, lua, exceptions, request, operation, content): super(_ExposedBoundRequest, self).__init__(lua, exceptions) self.request = request - har_request = request2har(request, operation, outgoing_data) + har_request = request2har(request, operation, content) self.url = self.lua.python2lua(har_request['url']) self.method = self.lua.python2lua(har_request['method']) # TODO: make info and headers attributes lazy diff --git a/splash/qwebpage.py b/splash/qwebpage.py index 78f80efac..4aed7c6dc 100644 --- a/splash/qwebpage.py +++ b/splash/qwebpage.py @@ -46,6 +46,7 @@ class SplashQWebPage(QWebPage): skip_custom_headers = False navigation_locked = False resource_timeout = 0 + request_body_enabled = False response_body_enabled = False def __init__(self, verbosity=0): diff --git a/splash/render_options.py b/splash/render_options.py index c0af48725..f85e30b47 100644 --- a/splash/render_options.py +++ b/splash/render_options.py @@ -136,6 +136,9 @@ def get_resource_timeout(self): def get_response_body(self): return self._get_bool("response_body", defaults.RESPONSE_BODY_ENABLED) + def get_request_body(self): + return self._get_bool("request_body", defaults.REQUEST_BODY_ENABLED) + def get_images(self): return self._get_bool("images", defaults.AUTOLOAD_IMAGES) diff --git a/splash/request_middleware.py b/splash/request_middleware.py index 0d2e5fefc..87cc2941c 100644 --- a/splash/request_middleware.py +++ 
b/splash/request_middleware.py @@ -87,17 +87,20 @@ def process(self, request, render_options, operation, data): return request -class ResponseBodyTrackingMiddleware(object): +class RequestResponseBodyTrackingMiddleware(object): """ - Request middleware which enables/disables response body tracking based on - ``response_body_enabled`` attribute of QWebPage. + Request middleware which enables/disables request and response body + tracking based on ``request_body_enabled`` and ``response_body_enabled`` + attributes of QWebPage. """ def process(self, request, render_options, operation, data): web_frame = get_request_webframe(request) if not web_frame: return request - track = getattr(web_frame.page(), 'response_body_enabled', False) - request.track_response_body = track + request.track_request_body = getattr(web_frame.page(), + 'request_body_enabled', False) + request.track_response_body = getattr(web_frame.page(), + 'response_body_enabled', False) return request diff --git a/splash/resources.py b/splash/resources.py index 2b5b42889..076c12951 100644 --- a/splash/resources.py +++ b/splash/resources.py @@ -319,6 +319,7 @@ def _get_render(self, request, options): params = options.get_common_params(self.js_profiles_path) params.update(options.get_jpeg_params()) params.update(options.get_include_params()) + params['request_body'] = options.get_request_body() params['response_body'] = options.get_response_body() return self.pool.render(JsonRender, options, **params) @@ -328,6 +329,7 @@ class RenderHarResource(BaseRenderResource): def _get_render(self, request, options): params = options.get_common_params(self.js_profiles_path) + params['request_body'] = options.get_request_body() params['response_body'] = options.get_response_body() return self.pool.render(HarRender, options, **params) @@ -437,6 +439,7 @@ def _validate_params(self, request): 'save_args': options.get_save_args(), 'load_args': options.get_load_args(), 'timeout': options.get_timeout(), + 'request_body': 
options.get_request_body(), 'response_body': options.get_response_body(), 'har': 1, 'png': 1, diff --git a/splash/tests/lua_modules/emulation.lua b/splash/tests/lua_modules/emulation.lua index 6e1894d1b..31f9aa168 100644 --- a/splash/tests/lua_modules/emulation.lua +++ b/splash/tests/lua_modules/emulation.lua @@ -40,6 +40,7 @@ function Splash:go_and_wait(args) -- set a resource timeout self.resource_timeout = args.resource_timeout self.response_body_enabled = args.response_body + self.request_body_enabled = args.request_body local ok, reason = self:go{url=url, baseurl=args.baseurl} if not ok then diff --git a/splash/tests/mockserver.py b/splash/tests/mockserver.py index ef2044062..e4a42b88d 100755 --- a/splash/tests/mockserver.py +++ b/splash/tests/mockserver.py @@ -1,5 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- +import json import os import optparse import base64 @@ -467,7 +468,8 @@ def getChild(self, name, request):
- + +
@@ -476,6 +478,34 @@ def getChild(self, name, request): """) +class XHRPostPage(Resource): + isLeaf = True + + def render_GET(self, request): + content_type = getarg(request, "content_type", + "application/octet-stream") + body = getarg(request, "body", "Hello world!") + + # Used to test large requests. + body_repeat = int(getarg(request, "body_repeat", 1)) + body *= body_repeat + + res = """ + + + + + + """ % (json.dumps(content_type), json.dumps(body)) + + return res.encode('utf-8') + + ExternalIFrameResource = _html_resource(""" @@ -1071,6 +1101,8 @@ def __init__(self, http_port, https_port, proxy_port): self.putChild(b"meta-redirect-target", MetaRedirectTarget()) self.putChild(b"http-redirect", HttpRedirectResource()) + self.putChild(b"do-post", XHRPostPage()) + self.putChild(b"", Index(self.children)) self.putChild(b"gzip", GzipRoot(self.children)) diff --git a/splash/tests/test_har.py b/splash/tests/test_har.py index 2b98345e3..6be73a2c4 100644 --- a/splash/tests/test_har.py +++ b/splash/tests/test_har.py @@ -179,6 +179,66 @@ def test_redirect_chain_nowait(self): (self.mockurl('jsredirect-chain'), 200), ]) + def test_request_body(self): + url = self.mockurl('jspost') + data = self.assertValidHar(url, wait=0.1) + entries = data['log']['entries'] + assert len(entries) == 2 + for entry in entries: + assert 'postData' not in entry['request'] + + data = self.assertValidHar(url, wait=0.1, request_body=1) + entries = data['log']['entries'] + assert len(entries) == 2 + post_data = entries[1]['request']['postData'] + assert 'encoding' not in post_data + assert post_data['mimeType'] == "application/x-www-form-urlencoded" + assert post_data['text'] == ("hidden-field=i-am-hidden&" + "a-field=field+value") + + def test_request_body_binary(self): + url = self.mockurl('do-post?content_type=application%2Foctet-stream&' + 'body=Hello+world%21') + data = self.assertValidHar(url, wait=0.1, request_body=1) + entries = data['log']['entries'] + assert len(entries) == 2 + post_data 
= entries[1]['request']['postData'] + assert post_data['encoding'] == 'base64' + assert post_data['mimeType'] == "application/octet-stream" + assert base64.b64decode(post_data['text']) == b"Hello world!" + + def test_request_body_non_ascii_urlencoded(self): + # "á" must not be encoded in this URL because it will also not be + # encoded in the URL from HAR data (as the default behaviour of + # QUrl.toString()) and assertValidHar() will compare them. + url = self.mockurl('do-post?' + 'content_type=application%2Fx-www-form-urlencoded&' + 'body=á') + data = self.assertValidHar(url, wait=0.1, request_body=1) + entries = data['log']['entries'] + assert len(entries) == 2 + post_data = entries[1]['request']['postData'] + assert post_data['encoding'] == 'base64' + assert post_data['mimeType'] == "application/x-www-form-urlencoded" + assert base64.b64decode(post_data['text']) == "á".encode('utf-8') + + def test_large_request_body(self): + # Test with something larger than the QIODevice buffer, which is 16 KiB + # (see QIODEVICE_BUFFERSIZE in qiodevice_p.h). + KiB = 2 ** 10 + body_size = 100 * KiB + url = self.mockurl(('do-post?' 
+ 'content_type=application%2Foctet-stream&' + 'body=A&' + 'body_repeat={}').format(body_size)) + data = self.assertValidHar(url, wait=0.1, request_body=1) + entries = data['log']['entries'] + assert len(entries) == 2 + post_data = entries[1]['request']['postData'] + assert post_data['encoding'] == 'base64' + assert post_data['mimeType'] == "application/octet-stream" + assert base64.b64decode(post_data['text']) == b'A' * body_size + def test_response_body(self): url = self.mockurl('show-image') data = self.assertValidHar(url) diff --git a/splash/tests/test_render.py b/splash/tests/test_render.py index c528d796f..d42bbe723 100644 --- a/splash/tests/test_render.py +++ b/splash/tests/test_render.py @@ -818,6 +818,17 @@ def test_history_status_codes(self): url = self.mockurl('getrequest') + '?code=%d' % code self.assertHistoryUrls({'url': url}, [(url, code)], full_urls=True) + def test_history_request_body(self): + history = self.assertHistoryUrls( + {'url': self.mockurl('jspost'), 'wait': 0.1, 'request_body': 1}, + [('jspost', 200), ('postrequest', 200)] + ) + post_data = history[1]['request']['postData'] + assert 'encoding' not in post_data + assert post_data['mimeType'] == "application/x-www-form-urlencoded" + assert post_data['text'] == ("hidden-field=i-am-hidden&" + "a-field=field+value") + def assertHistoryUrls(self, query, urls_and_codes, full_urls=False): query['history'] = 1 resp = self.request(query) diff --git a/splash/tests/test_request_body_lua.py b/splash/tests/test_request_body_lua.py new file mode 100644 index 000000000..e06689711 --- /dev/null +++ b/splash/tests/test_request_body_lua.py @@ -0,0 +1,99 @@ +# -*- coding: utf-8 -*- + +from splash.tests.test_execute import BaseLuaRenderTest + + +class RequestBodyLuaTest(BaseLuaRenderTest): + def test_request_body_enabled(self): + url = self.mockurl('jspost') + resp = self.request_lua(""" + treat = require('treat') + function main(splash) + splash.request_body_enabled = true + 
assert(splash:go(splash.args.url)) + splash:wait(0.1) + local har1 = splash:har{reset=true} + local enabled1 = splash.request_body_enabled + splash.request_body_enabled = false + assert(splash:go(splash.args.url)) + splash:wait(0.1) + local har2 = splash:har() + local enabled2 = splash.request_body_enabled + return { + har = treat.as_array({har1, har2}), + enabled1 = enabled1, + enabled2 = enabled2 + } + end + """, {'url': url}) + self.assertStatusCode(resp, 200) + data = resp.json() + + assert data['enabled1'] + assert not data['enabled2'] + + har1 = data['har'][0]['log']['entries'] + assert 'postData' in har1[1]['request'] + + har2 = data['har'][1]['log']['entries'] + assert 'postData' not in har2[1]['request'] + + def test_request_info_on_request_postdata(self): + url = self.mockurl('jspost') + resp = self.request_lua(""" + function main(splash) + splash.request_body_enabled = true + + local request_info = nil + + splash:on_request(function(request) + if request.method == "POST" then + request_info = request.info + end + end) + + assert(splash:go(splash.args.url)) + splash:wait(0.1) + + local post_data = request_info["postData"] + return { + text = post_data["text"], + mime_type = post_data["mimeType"] + } + end + """, {'url': url}) + self.assertStatusCode(resp, 200) + data = resp.json() + + assert data['text'] == "hidden-field=i-am-hidden&a-field=field+value" + assert data['mime_type'] == "application/x-www-form-urlencoded" + + def test_request_info_on_response_postdata(self): + url = self.mockurl('jspost') + resp = self.request_lua(""" + function main(splash) + splash.request_body_enabled = true + + local request_info = nil + + splash:on_response(function(response) + if response.request.method == "POST" then + request_info = response.request.info + end + end) + + assert(splash:go(splash.args.url)) + splash:wait(0.1) + + local post_data = request_info["postData"] + return { + text = post_data["text"], + mime_type = post_data["mimeType"] + } + end + """, {'url': 
url}) + self.assertStatusCode(resp, 200) + data = resp.json() + + assert data['text'] == "hidden-field=i-am-hidden&a-field=field+value" + assert data['mime_type'] == "application/x-www-form-urlencoded"