diff --git a/README.rst b/README.rst index 7dce08a62..71a1ad7df 100644 --- a/README.rst +++ b/README.rst @@ -11,520 +11,4 @@ twisted and QT webkit for rendering pages. The (twisted) QT reactor is used to make the sever fully asynchronous allowing to take advantage of webkit concurrency via QT main loop. -Requirements -============ - -See requirements.txt - - -Usage -===== - -To run the server:: - - python -m splash.server - -Run ``python -m splash.server --help`` to see options available. - -API -=== - -The following endpoints are supported: - -render.html ------------ - -Return the HTML of the javascript-rendered page. - -Arguments: - -url : string : required - The url to render (required) - -baseurl : string : optional - The base url to render the page with. - - If given, base HTML content will be feched from the URL given in the url - argument, and render using this as the base url. - -timeout : float : optional - A timeout (in seconds) for the render (defaults to 30) - -wait : float : optional - Time (in seconds) to wait for updates after page is loaded - (defaults to 0). Increase this value if you expect pages to contain - setInterval/setTimeout javascript calls, because with wait=0 - callbacks of setInterval/setTimeout won't be executed. Non-zero - 'wait' is also required for PNG rendering when viewport=full - (see later). - -proxy : string : optional - Proxy profile name. See :ref:`Proxy Profiles`. - -allowed_domains : string : optional - Comma-separated list of allowed domain names. - If present, Splash won't load anything neither from domains - not in this list nor from subdomains of domains not in this list. - -viewport : string : optional - View width and height (in pixels) of the browser viewport - to render the web page. Format is "x", e.g. 800x600. - It also accepts 'full' as value; viewport=full means that the whole - page (possibly very tall) will be rendered. Default value is 1024x768. 
- - 'viewport' parameter is more important for PNG rendering; - it is supported for all rendering endpoints because javascript - code execution can depend on viewport size. - -.. note:: - - viewport=full requires non-zero 'wait' parameter. This is - an unfortunate restriction, but it seems that this is the only - way to make rendering work reliably with viewport=full. - - -Curl example:: - - curl http://localhost:8050/render.html?url=http://domain.com/page-with-javascript.html&timeout=10&wait=0.5 - -The result is always encoded to utf-8. Always decode HTML data returned -by render.html endpoint from utf-8 even if there are tags like - -:: - - - -in the result. - -render.png ----------- - -Return a image (in PNG format) of the javascript-rendered page. - -Arguments: - -Same as `render.html`_ plus the following ones: - -width : integer : optional - Resize the rendered image to the given width (in pixels) keeping the aspect - ratio. - -height : integer : optional - Crop the renderd image to the given height (in pixels). Often used in - conjunction with the width argument to generate fixed-size thumbnails. - -Curl examples:: - - # render with timeout - curl http://localhost:8050/render.png?url=http://domain.com/page-with-javascript.html&timeout=10 - - # 320x240 thumbnail - curl http://localhost:8050/render.png?url=http://domain.com/page-with-javascript.html&width=320&height=240 - - -render.json ------------ - -Return a json-encoded dictionary with information about javascript-rendered -webpage. It can include HTML, PNG and other information, based on GET -arguments passed. - -Arguments: - -Same as `render.png`_ plus the following ones: - -html : integer : optional - Whether to include HTML in output. Possible values are - ``1`` (include) and ``0`` (exclude). Default is 0. - -png : integer : optional - Whether to include PNG in output. Possible values are - ``1`` (include) and ``0`` (exclude). Default is 0. 
- -iframes : integer : optional - Whether to include information about child frames in output. - Possible values are ``1`` (include) and ``0`` (exclude). - Default is 0. - -script : integer : optional - Whether to include the result of the executed javascript final - statement in output. Possible values are ``1`` (include) and ``0`` - (exclude). Default is 0. - -console : integer : optional - Whether to include the executed javascript console messages in output. - Possible values are ``1`` (include) and ``0`` (exclude). Default is 0. - -By default, URL, requested URL, page title and frame geometry is returned:: - - { - "url": "http://crawlera.com/", - "geometry": [0, 0, 640, 480], - "requestedUrl": "http://crawlera.com/", - "title": "Crawlera" - } - -Add 'html=1' to request to add HTML to the result:: - - { - "url": "http://crawlera.com/", - "geometry": [0, 0, 640, 480], - "requestedUrl": "http://crawlera.com/", - "html": "-->", - "requestedUrl": "http://www.youtube.com/embed/lSJvVqDLOOs?version=3&rel=1&fs=1&showsearch=0&showinfo=1&iv_load_policy=1&wmode=transparent", - "childFrames": [] - } - ], - "requestedUrl": "http://scrapinghub.com/autoscraping.html" - } - -Note that iframes can be nested. - -Pass both 'html=1' and 'iframes=1' to get HTML for all iframes -as well as for the main page:: - - { - "geometry": [0, 0, 640, 480], - "frameName": "", - "html": "...", - "geometry": [235, 502, 497, 310], - "frameName": "-->", - "requestedUrl": "http://www.youtube.com/embed/lSJvVqDLOOs?version=3&rel=1&fs=1&showsearch=0&showinfo=1&iv_load_policy=1&wmode=transparent", - "childFrames": [] - } - ], - "requestedUrl": "http://scrapinghub.com/autoscraping.html" - } - -Unlike 'html=1', 'png=1' does not affect data in childFrames. 
- -When executing JavaScript code add the parameter 'script=1' to the request -to include the code output in the result:: - - { - "url": "http://crawlera.com/", - "geometry": [0, 0, 640, 480], - "requestedUrl": "http://crawlera.com/", - "title": "Crawlera", - "script": "result of script..." - } - -The JavaScript code supports the console.log() function to log messages. -Add 'console=1' to the request to include the console output in the result:: - - { - "url": "http://crawlera.com/", - "geometry": [0, 0, 640, 480], - "requestedUrl": "http://crawlera.com/", - "title": "Crawlera", - "script": "result of script...", - "console": ["first log message", "second log message", ...] - } - - -Curl examples:: - - # full information - curl http://localhost:8050/render.json?url=http://domain.com/page-with-iframes.html&png=1&html=1&iframes=1 - - # HTML and meta information of page itself and all its iframes - curl http://localhost:8050/render.json?url=http://domain.com/page-with-iframes.html&html=1&iframes=1 - - # only meta information (like page/iframes titles and urls) - curl http://localhost:8050/render.json?url=http://domain.com/page-with-iframes.html&iframes=1 - - # render html and 320x240 thumbnail at once; do not return info about iframes - curl http://localhost:8050/render.json?url=http://domain.com/page-with-iframes.html&html=1&png=1&width=320&height=240 - - # Render page and execute simple Javascript function, display the js output - curl -X POST -H 'content-type: application/javascript' \ - -d 'function getAd(x){ return x; } getAd("abc");' \ - 'http://localhost:8050/render.json?url=http://domain.com&script=1' - - # Render page and execute simple Javascript function, display the js output and the console output - curl -X POST -H 'content-type: application/javascript' \ - -d 'function getAd(x){ return x; }; console.log("some log"); console.log("another log"); getAd("abc");' \ - 'http://localhost:8050/render.json?url=http://domain.com&script=1&console=1' - - -Executing 
custom Javascript code within page context -==================================================== - -Splash supports executing JavaScript code within the context of the page. -The JavaScript code is executed after the page finished loading (including -any delay defined by 'wait') but before the page is rendered. This allow to -use the javascript code to modify the page being rendered. - -To execute JavaScript code we use a POST request with the content-type set to -'application/javascript'. The body of the request should contain the code to -be executed. - -Curl example:: - - # Render page and modify its title dynamically - curl -X POST -H 'content-type: application/javascript' \ - -d 'document.title="My Title";' \ - 'http://localhost:8050/render.html?url=http://domain.com' - -To get the result of a javascript function executed within page -context use render.json endpoint with script=1 parameter. - -Javascript Profiles -------------------- - -Splash supports "javascript profiles" that allows to preload javascript files. -Javascript files defined in a profile are executed after the page is loaded -and before any javascript code defined in the request. - -The preloaded files can be used in the user's POST'ed code. - -To enable javascript profiles support, run splash server with the -``--js-profiles-path=`` option:: - - python -m splash.server --js-profiles-path=/etc/splash/js-profiles - -Then create a directory with the name of the profile and place inside it the -javascript files to load (note they must be utf-8 encoded). -The files are loaded in the order they appear in the filesystem. 
-Directory example:: - - /etc/splash/js-profiles/ - mywebsite/ - lib1.js - -To apply this javascript profile add the parameter -``js=mywebsite`` to the request:: - - curl -X POST -H 'content-type: application/javascript' \ - -d 'myfunc("Hello");' \ - 'http://localhost:8050/render.html?js=mywebsite&url=http://domain.com' - -Note that this example assumes that myfunc is a javascript function -defined in lib1.js. - -Javascript Security -------------------- - -If Splash is started with ``--js-cross-domain-access`` option - -:: - - python -m splash.server --js-cross-domain-access - -then javascript code is allowed to access the content of iframes -loaded from a security origin diferent to the original page (browsers usually -disallow that). This feature is useful for scraping, e.g. to extract the -html of a iframe page. An example of its usage: - - curl -X POST -H 'content-type: application/javascript' \ - -d 'function getContents(){ var f = document.getElementById("external"); return f.contentDocument.getElementsByTagName("body")[0].innerHTML; }; getContents();' \ - 'http://localhost:8050/render.html?url=http://domain.com' - -The javascript function 'getContents' will look for a iframe with -the id 'external' and extract its html contents. - -Note that allowing cross origin javascript calls is a potential -security issue, since it is possible that secret information (i.e cookies) -is exposed when this support is enabled; also, some websites don't load -when cross-domain security is disabled, so this feature is OFF by default. - - -Proxy Profiles -============== - -Splash supports "proxy profiles" that allows to set proxy handling rules -per-request using ``proxy`` GET parameter. - -To enable proxy profiles support, run splash server with -``--proxy-profiles-path=`` option:: - - python -m splash.server --proxy-profiles-path=/etc/splash/proxy-profiles - -Then create an INI file with "proxy profile" config inside the -specified folder, e.g. 
``/etc/splash/proxy-profiles/mywebsite.ini``. -Example contents of this file:: - - [proxy] - - ; required - host=proxy.crawlera.com - port=8010 - - ; optional, default is no auth - username=username - password=password - - [rules] - ; optional, default ".*" - whitelist= - .*mywebsite\.com.* - - ; optional, default is no blacklist - blacklist= - .*\.js.* - .*\.css.* - .*\.png - -whitelist and blacklist are newline-separated lists of regexes. -If URL matches one of whitelist patterns and matches none of blacklist -patterns, proxy specified in ``[proxy]`` section is used; -no proxy is used otherwise. - -Then, to apply proxy rules according to this profile, -add ``proxy=mywebsite`` parameter to request:: - - curl http://localhost:8050/render.html?url=http://mywebsite.com/page-with-javascript.html&proxy=mywebsite - -If ``default.ini`` profile is present, it will be used when ``proxy`` -GET argument is not specified. If you have ``default.ini`` profile -but don't want to apply it pass ``none`` as ``proxy`` value. - - -Splash as a Proxy -================= - -Splash supports working as HTTP proxy. In this mode all the HTTP requests received -will be proxied and the response will be rendered based in the following HTTP headers: - -X-Splash-render : string : required - The render mode to use, valid modes are: html, png and json. These modes have - the same behavior as the endpoints: render.html, render.png and render.json respectively. 
- -X-Splash-js-source : string - Allow to execute javascript code same as POST js code to render.html - -X-Splash-timeout : string - Same as 'timeout' argument for render.html - -X-Splash-wait : string - Same as 'wait' argument for render.html - -X-Splash-proxy : string - Same as 'proxy' argument for render.html - -X-Splash-allowed-domains : string - Same as 'allowed_domains' argument for render.html - -X-Splash-viewport : string - Same as 'viewport' argument for render.html - -X-Splash-width : string - Same as 'width' argument for render.png - -X-Splash-height : string - Same as 'height' argument for render.png - -X-Splash-html : string - Same as 'html' argument for render.json - -X-Splash-png : string - Same as 'png' argument for render.json - -X-Splash-iframes : string - Same as 'iframes' argument for render.json - -X-Splash-script : string - Same as 'script' argument for render.json - -X-Splash-console : string - Same as 'console' argument for render.json - - -Splash proxy mode is enabled by default. 
To disable it run splash -server with ``--disable-proxy`` option:: - - python -m splash.server --disable-proxy - - -Curl examples:: - - # Display json stats - curl -x localhost:8051 -H 'X-Splash-render: json' \ - http://www.domain.com - - # Get the html page and screenshot - curl -x localhost:8051 \ - -H "X-Splash-render: json" \ - -H "X-Splash-html: 1" \ - -H "X-Splash-png: 1" \ - http://www.mywebsite.com - - # Execute JS and return output - curl -x localhost:8051 \ - -H 'X-Splash-render: json' \ - -H 'X-Splash-script: 1' \ - -H 'X-Splash-js-source: function test(x){ return x; } test("abc");' \ - http://www.domain.com - - # Send POST request to site and save screenshot of results - curl -X POST -d '{"key":"val"}' -x localhost:8051 -o screenshot.png \ - -H 'X-Splash-render: png' \ - http://www.domain.com - - -Functional Tests -================ - -Run with:: - - nosetests - - -Stress tests -============ - -There are some stress tests that spawn its own splash server and a mock server -to run tests against. 
- -To run the stress tests:: - - python -m splash.tests.stress - -Typical output:: - - $ python -m splash.tests.stress - Total requests: 1000 - Concurrency : 50 - Log file : /tmp/splash-stress-48H91h.log - ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ - Received/Expected (per status code or error): - 200: 500/500 - 504: 200/200 - 502: 300/300 - +See documentation in ``docs/index.rst`` diff --git a/bin/splash b/bin/splash new file mode 100755 index 000000000..108ae4ccf --- /dev/null +++ b/bin/splash @@ -0,0 +1,4 @@ +#!/usr/bin/env python + +from splash.server import main +main() diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 000000000..0f74becf0 --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,181 @@ +# Makefile for Sphinx documentation +# + +# You can set these variables from the command line. 
+SPHINXOPTS = +SPHINXBUILD = sphinx-build +PAPER = +BUILDDIR = _build +PYTHON = python + +# User-friendly check for sphinx-build +ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) +$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) +endif + +# Internal variables. +PAPEROPT_a4 = -D latex_paper_size=a4 +PAPEROPT_letter = -D latex_paper_size=letter +ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . +# the i18n builder cannot share the environment and doctrees with the others +I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . + +.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext + +help: + @echo "Please use \`make ' where is one of" + @echo " html to make standalone HTML files" + @echo " dirhtml to make HTML files named index.html in directories" + @echo " singlehtml to make a single large HTML file" + @echo " pickle to make pickle files" + @echo " json to make JSON files" + @echo " htmlhelp to make HTML files and a HTML help project" + @echo " qthelp to make HTML files and a qthelp project" + @echo " devhelp to make HTML files and a Devhelp project" + @echo " epub to make an epub" + @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" + @echo " latexpdf to make LaTeX files and run them through pdflatex" + @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" + @echo " text to make text files" + @echo " man to make manual pages" + @echo " texinfo to make Texinfo files" + @echo " info to make Texinfo files and run them through makeinfo" + @echo " gettext to make PO message catalogs" + @echo " changes to 
make an overview of all changed/added/deprecated items" + @echo " xml to make Docutils-native XML files" + @echo " pseudoxml to make pseudoxml-XML files for display purposes" + @echo " linkcheck to check all external links for integrity" + @echo " doctest to run all doctests embedded in the documentation (if enabled)" + +clean: + rm -rf $(BUILDDIR)/* + +html: + $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." + +dirhtml: + $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." + +singlehtml: + $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml + @echo + @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." + +pickle: + $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle + @echo + @echo "Build finished; now you can process the pickle files." + +json: + $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json + @echo + @echo "Build finished; now you can process the JSON files." + +htmlhelp: + $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp + @echo + @echo "Build finished; now you can run HTML Help Workshop with the" \ + ".hhp project file in $(BUILDDIR)/htmlhelp." + +qthelp: + $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp + @echo + @echo "Build finished; now you can run "qcollectiongenerator" with the" \ + ".qhcp project file in $(BUILDDIR)/qthelp, like this:" + @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/Splash.qhcp" + @echo "To view the help file:" + @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/Splash.qhc" + +devhelp: + $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp + @echo + @echo "Build finished." 
+ @echo "To view the help file:" + @echo "# mkdir -p $$HOME/.local/share/devhelp/Splash" + @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/Splash" + @echo "# devhelp" + +epub: + $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub + @echo + @echo "Build finished. The epub file is in $(BUILDDIR)/epub." + +latex: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo + @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." + @echo "Run \`make' in that directory to run these through (pdf)latex" \ + "(use \`make latexpdf' here to do that automatically)." + +latexpdf: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through pdflatex..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +latexpdfja: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through platex and dvipdfmx..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +text: + $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text + @echo + @echo "Build finished. The text files are in $(BUILDDIR)/text." + +man: + $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man + @echo + @echo "Build finished. The manual pages are in $(BUILDDIR)/man." + +texinfo: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo + @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." + @echo "Run \`make' in that directory to run these through makeinfo" \ + "(use \`make info' here to do that automatically)." + +info: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo "Running Texinfo files through makeinfo..." + make -C $(BUILDDIR)/texinfo info + @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." + +gettext: + $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale + @echo + @echo "Build finished. 
The message catalogs are in $(BUILDDIR)/locale." + +changes: + $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes + @echo + @echo "The overview file is in $(BUILDDIR)/changes." + +linkcheck: + $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck + @echo + @echo "Link check complete; look for any errors in the above output " \ + "or in $(BUILDDIR)/linkcheck/output.txt." + +doctest: + $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest + @echo "Testing of doctests in the sources finished, look at the " \ + "results in $(BUILDDIR)/doctest/output.txt." + +xml: + $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml + @echo + @echo "Build finished. The XML files are in $(BUILDDIR)/xml." + +pseudoxml: + $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml + @echo + @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." + +htmlview: html + $(PYTHON) -c "import webbrowser; webbrowser.open('_build/html/index.html')" diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 000000000..bc37abd95 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,258 @@ +# -*- coding: utf-8 -*- +# +# Splash documentation build configuration file, created by +# sphinx-quickstart on Fri Apr 25 10:45:59 2014. +# +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +import sys +import os + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. 
+#sys.path.insert(0, os.path.abspath('.')) + +# -- General configuration ------------------------------------------------ + +# If your documentation needs a minimal Sphinx version, state it here. +#needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix of source filenames. +source_suffix = '.rst' + +# The encoding of source files. +#source_encoding = 'utf-8-sig' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. +project = u'Splash' +copyright = u'2014, Scrapinghub' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = '1.0' +# The full version, including alpha/beta/rc tags. +release = '1.0' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +#language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +#today = '' +# Else, today_fmt is used as the format for a strftime call. +#today_fmt = '%B %d, %Y' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +exclude_patterns = ['_build'] + +# The reST default role (used for this markup: `text`) to use for all +# documents. +#default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +#add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). 
+#add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +#show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# A list of ignored prefixes for module index sorting. +#modindex_common_prefix = [] + +# If true, keep warnings as "system message" paragraphs in the built documents. +#keep_warnings = False + + +# -- Options for HTML output ---------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +html_theme = 'default' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +#html_theme_options = {} + +# Add any paths that contain custom themes here, relative to this directory. +#html_theme_path = [] + +# The name for this set of Sphinx documents. If None, it defaults to +# " v documentation". +#html_title = None + +# A shorter title for the navigation bar. Default is the same as html_title. +#html_short_title = None + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +#html_logo = None + +# The name of an image file (within the static path) to use as favicon of the +# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +#html_favicon = None + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# Add any extra paths that contain custom files (such as robots.txt or +# .htaccess) here, relative to this directory. These files are copied +# directly to the root of the documentation. 
+#html_extra_path = [] + +# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, +# using the given strftime format. +#html_last_updated_fmt = '%b %d, %Y' + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +#html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. +#html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. +#html_additional_pages = {} + +# If false, no module index is generated. +#html_domain_indices = True + +# If false, no index is generated. +#html_use_index = True + +# If true, the index is split into individual pages for each letter. +#html_split_index = False + +# If true, links to the reST sources are added to the pages. +#html_show_sourcelink = True + +# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. +#html_show_sphinx = True + +# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. +#html_show_copyright = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +#html_use_opensearch = '' + +# This is the file name suffix for HTML files (e.g. ".xhtml"). +#html_file_suffix = None + +# Output file base name for HTML help builder. +htmlhelp_basename = 'Splashdoc' + + +# -- Options for LaTeX output --------------------------------------------- + +latex_elements = { +# The paper size ('letterpaper' or 'a4paper'). +#'papersize': 'letterpaper', + +# The font size ('10pt', '11pt' or '12pt'). +#'pointsize': '10pt', + +# Additional stuff for the LaTeX preamble. +#'preamble': '', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). 
+latex_documents = [ + ('index', 'Splash.tex', u'Splash Documentation', + u'Scrapinghub', 'manual'), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +#latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +#latex_use_parts = False + +# If true, show page references after internal links. +#latex_show_pagerefs = False + +# If true, show URL addresses after external links. +#latex_show_urls = False + +# Documents to append as an appendix to all manuals. +#latex_appendices = [] + +# If false, no module index is generated. +#latex_domain_indices = True + + +# -- Options for manual page output --------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + ('index', 'splash', u'Splash Documentation', + [u'Scrapinghub'], 1) +] + +# If true, show URL addresses after external links. +#man_show_urls = False + + +# -- Options for Texinfo output ------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + ('index', 'Splash', u'Splash Documentation', + u'Scrapinghub', 'Splash', 'One line description of project.', + 'Miscellaneous'), +] + +# Documents to append as an appendix to all manuals. +#texinfo_appendices = [] + +# If false, no module index is generated. +#texinfo_domain_indices = True + +# How to display URL addresses: 'footnote', 'no', or 'inline'. +#texinfo_show_urls = 'footnote' + +# If true, do not generate a @detailmenu in the "Top" node's menu. 
+#texinfo_no_detailmenu = False diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 000000000..cf81b706d --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,531 @@ +======================================= +Splash - A javascript rendering service +======================================= + +Introduction +============ + +Splash is a javascript rendering service with an HTTP API. It runs on top of +twisted and QT webkit for rendering pages. + +The (twisted) QT reactor is used to make the server fully asynchronous allowing +to take advantage of webkit concurrency via QT main loop. + +Requirements +============ + +.. literalinclude:: ../requirements.txt + +Usage +===== + +To run the server:: + + python -m splash.server + +Run ``python -m splash.server --help`` to see options available. + +API +=== + +The following endpoints are supported: + +render.html +----------- + +Return the HTML of the javascript-rendered page. + +Arguments: + +url : string : required + The url to render (required) + +baseurl : string : optional + The base url to render the page with. + + If given, base HTML content will be fetched from the URL given in the url + argument, and rendered using this as the base url. + +timeout : float : optional + A timeout (in seconds) for the render (defaults to 30) + +wait : float : optional + Time (in seconds) to wait for updates after page is loaded + (defaults to 0). Increase this value if you expect pages to contain + setInterval/setTimeout javascript calls, because with wait=0 + callbacks of setInterval/setTimeout won't be executed. Non-zero + 'wait' is also required for PNG rendering when viewport=full + (see later). + +proxy : string : optional + Proxy profile name. See :ref:`Proxy Profiles`. + +allowed_domains : string : optional + Comma-separated list of allowed domain names. + If present, Splash won't load anything from domains + not in this list or from subdomains of domains not in this list.
+ +viewport : string : optional + View width and height (in pixels) of the browser viewport + to render the web page. Format is "<width>x<height>", e.g. 800x600. + It also accepts 'full' as value; viewport=full means that the whole + page (possibly very tall) will be rendered. Default value is 1024x768. + + 'viewport' parameter is more important for PNG rendering; + it is supported for all rendering endpoints because javascript + code execution can depend on viewport size. + +.. note:: + + viewport=full requires non-zero 'wait' parameter. This is + an unfortunate restriction, but it seems that this is the only + way to make rendering work reliably with viewport=full. + + +Curl example:: + + curl 'http://localhost:8050/render.html?url=http://domain.com/page-with-javascript.html&timeout=10&wait=0.5' + +The result is always encoded to utf-8. Always decode HTML data returned +by render.html endpoint from utf-8 even if there are tags like + +:: + + <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1"> + +in the result. + +render.png +---------- + +Return an image (in PNG format) of the javascript-rendered page. + +Arguments: + +Same as `render.html`_ plus the following ones: + +width : integer : optional + Resize the rendered image to the given width (in pixels) keeping the aspect + ratio. + +height : integer : optional + Crop the rendered image to the given height (in pixels). Often used in + conjunction with the width argument to generate fixed-size thumbnails. + +Curl examples:: + + # render with timeout + curl 'http://localhost:8050/render.png?url=http://domain.com/page-with-javascript.html&timeout=10' + + # 320x240 thumbnail + curl 'http://localhost:8050/render.png?url=http://domain.com/page-with-javascript.html&width=320&height=240' + + +render.json +----------- + +Return a json-encoded dictionary with information about javascript-rendered +webpage. It can include HTML, PNG and other information, based on GET +arguments passed.
+ +Arguments: + +Same as `render.png`_ plus the following ones: + +html : integer : optional + Whether to include HTML in output. Possible values are + ``1`` (include) and ``0`` (exclude). Default is 0. + +png : integer : optional + Whether to include PNG in output. Possible values are + ``1`` (include) and ``0`` (exclude). Default is 0. + +iframes : integer : optional + Whether to include information about child frames in output. + Possible values are ``1`` (include) and ``0`` (exclude). + Default is 0. + +script : integer : optional + Whether to include the result of the executed javascript final + statement in output. Possible values are ``1`` (include) and ``0`` + (exclude). Default is 0. + +console : integer : optional + Whether to include the executed javascript console messages in output. + Possible values are ``1`` (include) and ``0`` (exclude). Default is 0. + +By default, URL, requested URL, page title and frame geometry is returned:: + + { + "url": "http://crawlera.com/", + "geometry": [0, 0, 640, 480], + "requestedUrl": "http://crawlera.com/", + "title": "Crawlera" + } + +Add 'html=1' to request to add HTML to the result:: + + { + "url": "http://crawlera.com/", + "geometry": [0, 0, 640, 480], + "requestedUrl": "http://crawlera.com/", + "html": "-->", + "requestedUrl": "http://www.youtube.com/embed/lSJvVqDLOOs?version=3&rel=1&fs=1&showsearch=0&showinfo=1&iv_load_policy=1&wmode=transparent", + "childFrames": [] + } + ], + "requestedUrl": "http://scrapinghub.com/autoscraping.html" + } + +Note that iframes can be nested. 
+ +Pass both 'html=1' and 'iframes=1' to get HTML for all iframes +as well as for the main page:: + + { + "geometry": [0, 0, 640, 480], + "frameName": "", + "html": "...", + "geometry": [235, 502, 497, 310], + "frameName": "-->", + "requestedUrl": "http://www.youtube.com/embed/lSJvVqDLOOs?version=3&rel=1&fs=1&showsearch=0&showinfo=1&iv_load_policy=1&wmode=transparent", + "childFrames": [] + } + ], + "requestedUrl": "http://scrapinghub.com/autoscraping.html" + } + +Unlike 'html=1', 'png=1' does not affect data in childFrames. + +When executing JavaScript code add the parameter 'script=1' to the request +to include the code output in the result:: + + { + "url": "http://crawlera.com/", + "geometry": [0, 0, 640, 480], + "requestedUrl": "http://crawlera.com/", + "title": "Crawlera", + "script": "result of script..." + } + +The JavaScript code supports the console.log() function to log messages. +Add 'console=1' to the request to include the console output in the result:: + + { + "url": "http://crawlera.com/", + "geometry": [0, 0, 640, 480], + "requestedUrl": "http://crawlera.com/", + "title": "Crawlera", + "script": "result of script...", + "console": ["first log message", "second log message", ...] 
+ } + + +Curl examples:: + + # full information + curl 'http://localhost:8050/render.json?url=http://domain.com/page-with-iframes.html&png=1&html=1&iframes=1' + + # HTML and meta information of page itself and all its iframes + curl 'http://localhost:8050/render.json?url=http://domain.com/page-with-iframes.html&html=1&iframes=1' + + # only meta information (like page/iframes titles and urls) + curl 'http://localhost:8050/render.json?url=http://domain.com/page-with-iframes.html&iframes=1' + + # render html and 320x240 thumbnail at once; do not return info about iframes + curl 'http://localhost:8050/render.json?url=http://domain.com/page-with-iframes.html&html=1&png=1&width=320&height=240' + + # Render page and execute simple Javascript function, display the js output + curl -X POST -H 'content-type: application/javascript' \ + -d 'function getAd(x){ return x; } getAd("abc");' \ + 'http://localhost:8050/render.json?url=http://domain.com&script=1' + + # Render page and execute simple Javascript function, display the js output and the console output + curl -X POST -H 'content-type: application/javascript' \ + -d 'function getAd(x){ return x; }; console.log("some log"); console.log("another log"); getAd("abc");' \ + 'http://localhost:8050/render.json?url=http://domain.com&script=1&console=1' + + +Executing custom Javascript code within page context +==================================================== + +Splash supports executing JavaScript code within the context of the page. +The JavaScript code is executed after the page finished loading (including +any delay defined by 'wait') but before the page is rendered. This allows the +javascript code to modify the page being rendered. + +To execute JavaScript code we use a POST request with the content-type set to +'application/javascript'. The body of the request should contain the code to +be executed.
+ +Curl example:: + + # Render page and modify its title dynamically + curl -X POST -H 'content-type: application/javascript' \ + -d 'document.title="My Title";' \ + 'http://localhost:8050/render.html?url=http://domain.com' + +To get the result of a javascript function executed within page +context use render.json endpoint with script=1 parameter. + +Javascript Profiles +------------------- + +Splash supports "javascript profiles" that allow preloading javascript files. +Javascript files defined in a profile are executed after the page is loaded +and before any javascript code defined in the request. + +The preloaded files can be used in the user's POST'ed code. + +To enable javascript profiles support, run splash server with the +``--js-profiles-path=<path to a folder with js profiles>`` option:: + + python -m splash.server --js-profiles-path=/etc/splash/js-profiles + +Then create a directory with the name of the profile and place inside it the +javascript files to load (note they must be utf-8 encoded). +The files are loaded in the order they appear in the filesystem. +Directory example:: + + /etc/splash/js-profiles/ + mywebsite/ + lib1.js + +To apply this javascript profile add the parameter +``js=mywebsite`` to the request:: + + curl -X POST -H 'content-type: application/javascript' \ + -d 'myfunc("Hello");' \ + 'http://localhost:8050/render.html?js=mywebsite&url=http://domain.com' + +Note that this example assumes that myfunc is a javascript function +defined in lib1.js. + +Javascript Security +------------------- + +If Splash is started with ``--js-cross-domain-access`` option + +:: + + python -m splash.server --js-cross-domain-access + +then javascript code is allowed to access the content of iframes +loaded from a security origin different from the original page (browsers usually +disallow that). This feature is useful for scraping, e.g. to extract the +html of an iframe page.
An example of its usage:: + + curl -X POST -H 'content-type: application/javascript' \ + -d 'function getContents(){ var f = document.getElementById("external"); return f.contentDocument.getElementsByTagName("body")[0].innerHTML; }; getContents();' \ + 'http://localhost:8050/render.html?url=http://domain.com' + +The javascript function 'getContents' will look for an iframe with +the id 'external' and extract its html contents. + +Note that allowing cross origin javascript calls is a potential +security issue, since it is possible that secret information (e.g. cookies) +is exposed when this support is enabled; also, some websites don't load +when cross-domain security is disabled, so this feature is OFF by default. + + +.. _proxy profiles: + +Proxy Profiles +============== + +Splash supports "proxy profiles" that allow setting proxy handling rules +per-request using ``proxy`` GET parameter. + +To enable proxy profiles support, run splash server with +``--proxy-profiles-path=<path to a folder with proxy profiles>`` option:: + + python -m splash.server --proxy-profiles-path=/etc/splash/proxy-profiles + +Then create an INI file with "proxy profile" config inside the +specified folder, e.g. ``/etc/splash/proxy-profiles/mywebsite.ini``. +Example contents of this file:: + + [proxy] + + ; required + host=proxy.crawlera.com + port=8010 + + ; optional, default is no auth + username=username + password=password + + [rules] + ; optional, default ".*" + whitelist= + .*mywebsite\.com.* + + ; optional, default is no blacklist + blacklist= + .*\.js.* + .*\.css.* + .*\.png + +whitelist and blacklist are newline-separated lists of regexes. +If URL matches one of whitelist patterns and matches none of blacklist +patterns, proxy specified in ``[proxy]`` section is used; +no proxy is used otherwise.
+ +Then, to apply proxy rules according to this profile, +add ``proxy=mywebsite`` parameter to request:: + + curl 'http://localhost:8050/render.html?url=http://mywebsite.com/page-with-javascript.html&proxy=mywebsite' + +If ``default.ini`` profile is present, it will be used when ``proxy`` +GET argument is not specified. If you have ``default.ini`` profile +but don't want to apply it pass ``none`` as ``proxy`` value. + + +Splash as a Proxy +================= + +Splash supports working as an HTTP proxy. In this mode all the HTTP requests received +will be proxied and the response will be rendered based on the following HTTP headers: + +X-Splash-render : string : required + The render mode to use, valid modes are: html, png and json. These modes have + the same behavior as the endpoints: render.html, render.png and render.json respectively. + +X-Splash-js-source : string + Javascript code to execute; same as the POST'ed js code for render.html + +X-Splash-timeout : string + Same as 'timeout' argument for render.html + +X-Splash-wait : string + Same as 'wait' argument for render.html + +X-Splash-proxy : string + Same as 'proxy' argument for render.html + +X-Splash-allowed-domains : string + Same as 'allowed_domains' argument for render.html + +X-Splash-viewport : string + Same as 'viewport' argument for render.html + +X-Splash-width : string + Same as 'width' argument for render.png + +X-Splash-height : string + Same as 'height' argument for render.png + +X-Splash-html : string + Same as 'html' argument for render.json + +X-Splash-png : string + Same as 'png' argument for render.json + +X-Splash-iframes : string + Same as 'iframes' argument for render.json + +X-Splash-script : string + Same as 'script' argument for render.json + +X-Splash-console : string + Same as 'console' argument for render.json + + +Splash proxy mode is enabled by default.
To disable it run splash +server with ``--disable-proxy`` option:: + + python -m splash.server --disable-proxy + + +Curl examples:: + + # Display json stats + curl -x localhost:8051 -H 'X-Splash-render: json' \ + http://www.domain.com + + # Get the html page and screenshot + curl -x localhost:8051 \ + -H "X-Splash-render: json" \ + -H "X-Splash-html: 1" \ + -H "X-Splash-png: 1" \ + http://www.mywebsite.com + + # Execute JS and return output + curl -x localhost:8051 \ + -H 'X-Splash-render: json' \ + -H 'X-Splash-script: 1' \ + -H 'X-Splash-js-source: function test(x){ return x; } test("abc");' \ + http://www.domain.com + + # Send POST request to site and save screenshot of results + curl -X POST -d '{"key":"val"}' -x localhost:8051 -o screenshot.png \ + -H 'X-Splash-render: png' \ + http://www.domain.com + + +Functional Tests +================ + +Run with:: + + nosetests + + +Stress tests +============ + +There are some stress tests that spawn its own splash server and a mock server +to run tests against. 
+ +To run the stress tests:: + + python -m splash.tests.stress + +Typical output:: + + $ python -m splash.tests.stress + Total requests: 1000 + Concurrency : 50 + Log file : /tmp/splash-stress-48H91h.log + ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + Received/Expected (per status code or error): + 200: 500/500 + 504: 200/200 + 502: 300/300 + diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 000000000..7c1c8739a --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,242 @@ +@ECHO OFF + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set BUILDDIR=_build +set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . +set I18NSPHINXOPTS=%SPHINXOPTS% . +if NOT "%PAPER%" == "" ( + set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% + set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% +) + +if "%1" == "" goto help + +if "%1" == "help" ( + :help + echo.Please use `make ^` where ^ is one of + echo. html to make standalone HTML files + echo. 
dirhtml to make HTML files named index.html in directories + echo. singlehtml to make a single large HTML file + echo. pickle to make pickle files + echo. json to make JSON files + echo. htmlhelp to make HTML files and a HTML help project + echo. qthelp to make HTML files and a qthelp project + echo. devhelp to make HTML files and a Devhelp project + echo. epub to make an epub + echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter + echo. text to make text files + echo. man to make manual pages + echo. texinfo to make Texinfo files + echo. gettext to make PO message catalogs + echo. changes to make an overview over all changed/added/deprecated items + echo. xml to make Docutils-native XML files + echo. pseudoxml to make pseudoxml-XML files for display purposes + echo. linkcheck to check all external links for integrity + echo. doctest to run all doctests embedded in the documentation if enabled + goto end +) + +if "%1" == "clean" ( + for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i + del /q /s %BUILDDIR%\* + goto end +) + + +%SPHINXBUILD% 2> nul +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "html" ( + %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/html. + goto end +) + +if "%1" == "dirhtml" ( + %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 
+ goto end +) + +if "%1" == "singlehtml" ( + %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. + goto end +) + +if "%1" == "pickle" ( + %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can process the pickle files. + goto end +) + +if "%1" == "json" ( + %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can process the JSON files. + goto end +) + +if "%1" == "htmlhelp" ( + %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can run HTML Help Workshop with the ^ +.hhp project file in %BUILDDIR%/htmlhelp. + goto end +) + +if "%1" == "qthelp" ( + %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can run "qcollectiongenerator" with the ^ +.qhcp project file in %BUILDDIR%/qthelp, like this: + echo.^> qcollectiongenerator %BUILDDIR%\qthelp\Splash.qhcp + echo.To view the help file: + echo.^> assistant -collectionFile %BUILDDIR%\qthelp\Splash.ghc + goto end +) + +if "%1" == "devhelp" ( + %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. + goto end +) + +if "%1" == "epub" ( + %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The epub file is in %BUILDDIR%/epub. + goto end +) + +if "%1" == "latex" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "latexpdf" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + cd %BUILDDIR%/latex + make all-pdf + cd %BUILDDIR%/.. + echo. 
+ echo.Build finished; the PDF files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "latexpdfja" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + cd %BUILDDIR%/latex + make all-pdf-ja + cd %BUILDDIR%/.. + echo. + echo.Build finished; the PDF files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "text" ( + %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The text files are in %BUILDDIR%/text. + goto end +) + +if "%1" == "man" ( + %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The manual pages are in %BUILDDIR%/man. + goto end +) + +if "%1" == "texinfo" ( + %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. + goto end +) + +if "%1" == "gettext" ( + %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The message catalogs are in %BUILDDIR%/locale. + goto end +) + +if "%1" == "changes" ( + %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes + if errorlevel 1 exit /b 1 + echo. + echo.The overview file is in %BUILDDIR%/changes. + goto end +) + +if "%1" == "linkcheck" ( + %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck + if errorlevel 1 exit /b 1 + echo. + echo.Link check complete; look for any errors in the above output ^ +or in %BUILDDIR%/linkcheck/output.txt. + goto end +) + +if "%1" == "doctest" ( + %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest + if errorlevel 1 exit /b 1 + echo. + echo.Testing of doctests in the sources finished, look at the ^ +results in %BUILDDIR%/doctest/output.txt. + goto end +) + +if "%1" == "xml" ( + %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The XML files are in %BUILDDIR%/xml. 
+ goto end +) + +if "%1" == "pseudoxml" ( + %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. + goto end +) + +:end diff --git a/setup.py b/setup.py index 0b463eefa..b6b61facb 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,34 @@ -from setuptools import setup, find_packages +setup_args = { + 'name': 'splash', + 'version': '1.0', + 'packages': ['splash'], + 'url': 'https://github.com/scrapinghub/splash', + 'description': 'A javascript rendered with a HTTP API', + 'long_description': open('docs/index.rst').read(), + 'author': 'Scrapinghub', + 'maintainer': 'Scrapinghub', + 'maintainer_email': 'info@scrapinghub.com', + 'license': 'BSD', + 'scripts': ['bin/splash'], + 'classifiers': [ + 'Programming Language :: Python', + 'Programming Language :: Python :: 2.6', + 'Programming Language :: Python :: 2.7', + 'License :: OSI Approved :: BSD License', + 'Operating System :: OS Independent', + 'Development Status :: 5 - Production/Stable', + 'Environment :: Console', + 'Environment :: No Input/Output (Daemon)', + 'Topic :: Internet :: WWW/HTTP', + ], +} -setup( - name = 'splash', - version = '1.0', - packages = find_packages(), -) + +try: + from setuptools import setup +except ImportError: + from distutils.core import setup +else: + setup_args['install_requires'] = ['Twisted', 'qt4reactor', 'psutil'] + +setup(**setup_args)