
Commit

Merge pull request pypa#5838 from uranusjr/htmlpage-extract-breakdown-get-page

Refactor _get_html_page() to use exceptions for flow control
cjerdonek authored Oct 11, 2018
2 parents 541ec23 + fc53f71 commit 8dbbe16
Showing 3 changed files with 291 additions and 82 deletions.
news/5838.bugfix (1 addition, 0 deletions)
@@ -0,0 +1 @@
+Fix content type detection if a directory named like an archive is used as a package source.
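
Note: an illustrative sketch of the scenario behind this entry (directory and package names are hypothetical). A local find-links source may legitimately be a directory whose name ends like an archive:

    packages.tar.gz/
        index.html
        demo-1.0-py2.py3-none-any.whl

    $ pip install --find-links=./packages.tar.gz/ demo

Previously the archive-like name triggered the content-type probe, which only understands http and https URLs and so reported nothing useful for the file:// URL; the directory was then skipped as not HTML. With this change (see the diff below) the directory is detected first and its index.html is fetched instead.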
src/pip/_internal/index.py (128 additions, 82 deletions)
@@ -59,18 +59,113 @@
 logger = logging.getLogger(__name__)


-def _get_content_type(url, session):
-    """Get the Content-Type of the given url, using a HEAD request"""
+def _match_vcs_scheme(url):
+    """Look for VCS schemes in the URL.
+
+    Returns the matched VCS scheme, or None if there's no match.
+    """
+    from pip._internal.vcs import VcsSupport
+    for scheme in VcsSupport.schemes:
+        if url.lower().startswith(scheme) and url[len(scheme)] in '+:':
+            return scheme
+    return None
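
Note: a quick sketch of the new helper's behaviour (illustrative, assuming VcsSupport.schemes contains prefixes such as 'git', 'hg' and 'svn'):

    _match_vcs_scheme('git+https://github.com/pypa/pip.git')  # -> 'git'
    _match_vcs_scheme('https://pypi.org/simple/')             # -> None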


+def _is_url_like_archive(url):
+    """Return whether the URL looks like an archive.
+    """
+    filename = Link(url).filename
+    for bad_ext in ARCHIVE_EXTENSIONS:
+        if filename.endswith(bad_ext):
+            return True
+    return False
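
Note: illustrative behaviour, assuming ARCHIVE_EXTENSIONS includes entries such as '.zip', '.whl' and '.tar.gz':

    _is_url_like_archive('https://example.com/demo-1.0.tar.gz')  # -> True
    _is_url_like_archive('https://example.com/simple/demo/')     # -> False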


+class _NotHTML(Exception):
+    def __init__(self, content_type, request_desc):
+        super(_NotHTML, self).__init__(content_type, request_desc)
+        self.content_type = content_type
+        self.request_desc = request_desc
+
+
+def _ensure_html_header(response):
+    """Check the Content-Type header to ensure the response contains HTML.
+
+    Raises `_NotHTML` if the content type is not text/html.
+    """
+    content_type = response.headers.get("Content-Type", "")
+    if not content_type.lower().startswith("text/html"):
+        raise _NotHTML(content_type, response.request.method)

+class _NotHTTP(Exception):
+    pass
+
+
+def _ensure_html_response(url, session):
+    """Send a HEAD request to the URL, and ensure the response contains HTML.
+
+    Raises `_NotHTTP` if the URL is not available for a HEAD request, or
+    `_NotHTML` if the content type is not text/html.
+    """
     scheme, netloc, path, query, fragment = urllib_parse.urlsplit(url)
     if scheme not in {'http', 'https'}:
-        # FIXME: some warning or something?
-        # assertion error?
-        return ''
+        raise _NotHTTP()

     resp = session.head(url, allow_redirects=True)
     resp.raise_for_status()

-    return resp.headers.get("Content-Type", "")
+    _ensure_html_header(resp)
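
Note: a non-HTTP URL now raises immediately instead of returning an empty content type, so callers can tell "cannot check" apart from "checked and not HTML". A sketch (the URL is a placeholder, and a plain requests.Session stands in for pip's PipSession):

    import requests

    try:
        _ensure_html_response('file:///srv/packages.tar.gz',
                              session=requests.Session())
    except _NotHTTP:
        print('file:// URLs cannot be checked with HEAD')
    except _NotHTML as exc:
        print('not HTML: %s' % exc.content_type)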


+def _get_html_response(url, session):
+    """Access an HTML page with GET, and return the response.
+
+    This consists of three parts:
+
+    1. If the URL looks suspiciously like an archive, send a HEAD first to
+       check the Content-Type is HTML, to avoid downloading a large file.
+       Raise `_NotHTTP` if the content type cannot be determined, or
+       `_NotHTML` if it is not HTML.
+    2. Actually perform the request. Raise HTTP exceptions on network failures.
+    3. Check the Content-Type header to make sure we got HTML, and raise
+       `_NotHTML` otherwise.
+    """
+    if _is_url_like_archive(url):
+        _ensure_html_response(url, session=session)
+
+    logger.debug('Getting page %s', url)
+
+    resp = session.get(
+        url,
+        headers={
+            "Accept": "text/html",
+            # We don't want to blindly returned cached data for
+            # /simple/, because authors generally expecting that
+            # twine upload && pip install will function, but if
+            # they've done a pip install in the last ~10 minutes
+            # it won't. Thus by setting this to zero we will not
+            # blindly use any cached data, however the benefit of
+            # using max-age=0 instead of no-cache, is that we will
+            # still support conditional requests, so we will still
+            # minimize traffic sent in cases where the page hasn't
+            # changed at all, we will just always incur the round
+            # trip for the conditional GET now instead of only
+            # once per 10 minutes.
+            # For more information, please see pypa/pip#5670.
+            "Cache-Control": "max-age=0",
+        },
+    )
+    resp.raise_for_status()
+
+    # The check for archives above only works if the url ends with
+    # something that looks like an archive. However that is not a
+    # requirement of an url. Unless we issue a HEAD request on every
+    # url we cannot know ahead of time for sure if something is HTML
+    # or not. However we can check after we've downloaded it.
+    _ensure_html_header(resp)
+
+    return resp
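
Note: a usage sketch of the whole helper (illustrative; example.com is a placeholder, and a plain requests.Session stands in for pip's PipSession):

    import requests

    session = requests.Session()
    try:
        resp = _get_html_response('https://example.com/simple/demo/', session)
    except _NotHTTP:
        print('archive-like URL whose type cannot be checked by HEAD')
    except _NotHTML as exc:
        print('%s request got Content-Type: %s'
              % (exc.request_desc, exc.content_type))
    except requests.HTTPError as exc:
        print('request failed: %s' % exc)
    else:
        print('got HTML page at %s' % resp.url)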


def _handle_get_page_fail(link, reason, url, meth=None):
@@ -85,82 +180,36 @@ def _get_html_page(link, session=None):
"_get_html_page() missing 1 required keyword argument: 'session'"
)

url = link.url
url = url.split('#', 1)[0]
url = link.url.split('#', 1)[0]

     # Check for VCS schemes that do not support lookup as web pages.
-    from pip._internal.vcs import VcsSupport
-    for scheme in VcsSupport.schemes:
-        if url.lower().startswith(scheme) and url[len(scheme)] in '+:':
-            logger.debug('Cannot look at %s URL %s', scheme, link)
-            return None
+    vcs_scheme = _match_vcs_scheme(url)
+    if vcs_scheme:
+        logger.debug('Cannot look at %s URL %s', vcs_scheme, link)
+        return None

-    try:
-        filename = link.filename
-        for bad_ext in ARCHIVE_EXTENSIONS:
-            if filename.endswith(bad_ext):
-                content_type = _get_content_type(url, session=session)
-                if content_type.lower().startswith('text/html'):
-                    break
-                else:
-                    logger.debug(
-                        'Skipping page %s because of Content-Type: %s',
-                        link,
-                        content_type,
-                    )
-                    return
-
-        logger.debug('Getting page %s', url)
-
-        # Tack index.html onto file:// URLs that point to directories
-        (scheme, netloc, path, params, query, fragment) = \
-            urllib_parse.urlparse(url)
-        if (scheme == 'file' and
-                os.path.isdir(urllib_request.url2pathname(path))):
-            # add trailing slash if not present so urljoin doesn't trim
-            # final segment
-            if not url.endswith('/'):
-                url += '/'
-            url = urllib_parse.urljoin(url, 'index.html')
-            logger.debug(' file: URL is directory, getting %s', url)
-
-        resp = session.get(
-            url,
-            headers={
-                "Accept": "text/html",
-                # We don't want to blindly returned cached data for
-                # /simple/, because authors generally expecting that
-                # twine upload && pip install will function, but if
-                # they've done a pip install in the last ~10 minutes
-                # it won't. Thus by setting this to zero we will not
-                # blindly use any cached data, however the benefit of
-                # using max-age=0 instead of no-cache, is that we will
-                # still support conditional requests, so we will still
-                # minimize traffic sent in cases where the page hasn't
-                # changed at all, we will just always incur the round
-                # trip for the conditional GET now instead of only
-                # once per 10 minutes.
-                # For more information, please see pypa/pip#5670.
-                "Cache-Control": "max-age=0",
-            },
-        )
-        resp.raise_for_status()
-
-        # The check for archives above only works if the url ends with
-        # something that looks like an archive. However that is not a
-        # requirement of an url. Unless we issue a HEAD request on every
-        # url we cannot know ahead of time for sure if something is HTML
-        # or not. However we can check after we've downloaded it.
-        content_type = resp.headers.get('Content-Type', 'unknown')
-        if not content_type.lower().startswith("text/html"):
-            logger.debug(
-                'Skipping page %s because of Content-Type: %s',
-                link,
-                content_type,
-            )
-            return
-
-        inst = HTMLPage(resp.content, resp.url, resp.headers)
+    # Tack index.html onto file:// URLs that point to directories
+    scheme, _, path, _, _, _ = urllib_parse.urlparse(url)
+    if (scheme == 'file' and os.path.isdir(urllib_request.url2pathname(path))):
+        # add trailing slash if not present so urljoin doesn't trim
+        # final segment
+        if not url.endswith('/'):
+            url += '/'
+        url = urllib_parse.urljoin(url, 'index.html')
+        logger.debug(' file: URL is directory, getting %s', url)
+
+    try:
+        resp = _get_html_response(url, session=session)
+    except _NotHTTP as exc:
+        logger.debug(
+            'Skipping page %s because it looks like an archive, and cannot '
+            'be checked by HEAD.', link,
+        )
+    except _NotHTML as exc:
+        logger.debug(
+            'Skipping page %s because the %s request got Content-Type: %s',
+            link, exc.request_desc, exc.content_type,
+        )
     except requests.HTTPError as exc:
         _handle_get_page_fail(link, exc, url)
     except RetryError as exc:
@@ -174,7 +223,7 @@
     except requests.Timeout:
         _handle_get_page_fail(link, "timed out", url)
     else:
-        return inst
+        return HTMLPage(resp.content, resp.url, resp.headers)
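
Note: the net effect is that every skip and failure mode in _get_html_page() now surfaces as an exception handled above, and only the else branch constructs a page. A calling sketch (link and session stand for the Link and PipSession that pip passes in):

    page = _get_html_page(link, session=session)
    if page is None:
        pass  # the reason was already logged by one of the handlers above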


class PackageFinder(object):
@@ -679,7 +728,7 @@ def _get_pages(self, locations, project_name):
                 continue
             seen.add(location)

-            page = self._get_page(location)
+            page = _get_html_page(location, session=self.session)
             if page is None:
                 continue

@@ -796,9 +845,6 @@ def _link_package_versions(self, link, search):

         return InstallationCandidate(search.supplied, version, link)

-    def _get_page(self, link):
-        return _get_html_page(link, session=self.session)


 def egg_info_matches(
         egg_info, search_name, link,