Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[mixcloud] Add support for new frontend #14132

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion youtube_dl/compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import email
import getpass
import io
import itertools
import optparse
import os
import re
Expand All @@ -15,7 +16,6 @@
import struct
import subprocess
import sys
import itertools
import xml.etree.ElementTree


Expand Down Expand Up @@ -2898,6 +2898,13 @@ def compat_struct_unpack(spec, *args):
compat_struct_pack = struct.pack
compat_struct_unpack = struct.unpack

try:
from future_builtins import zip as compat_zip
except ImportError: # not 2.6+ or is 3.x
try:
from itertools import izip as compat_zip # < 2.5 or 3.x
except ImportError:
compat_zip = zip

__all__ = [
'compat_HTMLParseError',
Expand Down Expand Up @@ -2948,5 +2955,6 @@ def compat_struct_unpack(spec, *args):
'compat_urlretrieve',
'compat_xml_parse_error',
'compat_xpath',
'compat_zip',
'workaround_optparse_bug9161',
]
167 changes: 105 additions & 62 deletions youtube_dl/extractor/mixcloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,16 @@
from ..compat import (
compat_chr,
compat_ord,
compat_str,
compat_urllib_parse_unquote,
compat_urlparse,
compat_zip
)
from ..utils import (
clean_html,
ExtractorError,
OnDemandPagedList,
str_to_int,
)
try_get)


class MixcloudIE(InfoExtractor):
Expand Down Expand Up @@ -54,27 +54,19 @@ class MixcloudIE(InfoExtractor):
'only_matching': True,
}]

_keys = [
'return { requestAnimationFrame: function(callback) { callback(); }, innerHeight: 500 };',
'pleasedontdownloadourmusictheartistswontgetpaid',
'window.addEventListener = window.addEventListener || function() {};',
'(function() { return new Date().toLocaleDateString(); })()'
]
_current_key = None

# See https://www.mixcloud.com/media/js2/www_js_2.9e23256562c080482435196ca3975ab5.js
def _decrypt_play_info(self, play_info, video_id):
play_info = base64.b64decode(play_info.encode('ascii'))
for num, key in enumerate(self._keys, start=1):
try:
return self._parse_json(
''.join([
compat_chr(compat_ord(ch) ^ compat_ord(key[idx % len(key)]))
for idx, ch in enumerate(play_info)]),
video_id)
except ExtractorError:
if num == len(self._keys):
raise
@staticmethod
def _decrypt_xor_cipher(key, ciphertext):
"""Encrypt/Decrypt XOR cipher. Both ways are possible because it's XOR."""
return ''.join([
compat_chr(compat_ord(ch) ^ compat_ord(k))
for ch, k in compat_zip(ciphertext, itertools.cycle(key))])

@staticmethod
def _decrypt_and_extend(stream_info, url_key, getter, key, formats):
maybe_url = stream_info.get(url_key)
if maybe_url is not None:
decrypted = MixcloudIE._decrypt_xor_cipher(key, base64.b64decode(maybe_url))
formats.extend(getter(decrypted))

def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
Expand All @@ -84,54 +76,105 @@ def _real_extract(self, url):

webpage = self._download_webpage(url, track_id)

if not self._current_key:
js_url = self._search_regex(
r'<script[^>]+\bsrc=["\"](https://(?:www\.)?mixcloud\.com/media/js2/www_js_4\.[^>]+\.js)',
webpage, 'js url', default=None)
if js_url:
js = self._download_webpage(js_url, track_id, fatal=False)
if js:
KEY_RE_TEMPLATE = r'player\s*:\s*{.*?\b%s\s*:\s*(["\'])(?P<key>(?:(?!\1).)+)\1'
for key_name in ('value', 'key_value', 'key_value.*?', '.*?value.*?'):
key = self._search_regex(
KEY_RE_TEMPLATE % key_name, js, 'key',
default=None, group='key')
if key and isinstance(key, compat_str):
self._keys.insert(0, key)
self._current_key = key
# Legacy path
encrypted_play_info = self._search_regex(
r'm-play-info="([^"]+)"', webpage, 'play info', default=None)

if encrypted_play_info is not None:
# Decode
encrypted_play_info = base64.b64decode(encrypted_play_info)
else:
# New path
full_info_json = self._parse_json(self._html_search_regex(
r'<script id="relay-data" type="text/x-mixcloud">([^<]+)</script>', webpage, 'play info'), 'play info')
for item in full_info_json:
item_data = try_get(item, lambda x: x['cloudcast']['data']['cloudcastLookup'])
if try_get(item_data, lambda x: x['streamInfo']['url']):
info_json = item_data
break
else:
raise ExtractorError('Failed to extract matching stream info')

message = self._html_search_regex(
r'(?s)<div[^>]+class="global-message cloudcast-disabled-notice-light"[^>]*>(.+?)<(?:a|/div)',
webpage, 'error message', default=None)

encrypted_play_info = self._search_regex(
r'm-play-info="([^"]+)"', webpage, 'play info')

play_info = self._decrypt_play_info(encrypted_play_info, track_id)

if message and 'stream_url' not in play_info:
raise ExtractorError('%s said: %s' % (self.IE_NAME, message), expected=True)

song_url = play_info['stream_url']

title = self._html_search_regex(r'm-title="([^"]+)"', webpage, 'title')
thumbnail = self._proto_relative_url(self._html_search_regex(
r'm-thumbnail-url="([^"]+)"', webpage, 'thumbnail', fatal=False))
uploader = self._html_search_regex(
r'm-owner-name="([^"]+)"', webpage, 'uploader', fatal=False)
uploader_id = self._search_regex(
r'\s+"profile": "([^"]+)",', webpage, 'uploader id', fatal=False)
description = self._og_search_description(webpage)
view_count = str_to_int(self._search_regex(
[r'<meta itemprop="interactionCount" content="UserPlays:([0-9]+)"',
r'/listeners/?">([0-9,.]+)</a>',
r'(?:m|data)-tooltip=["\']([\d,.]+) plays'],
webpage, 'play count', default=None))
js_url = self._search_regex(
r'<script[^>]+\bsrc=["\"](https://(?:www\.)?mixcloud\.com/media/js2/www_js_4\.[^>]+\.js)',
webpage, 'js url', default=None)
if js_url is None:
js_url = self._search_regex(
r'<script[^>]+\bsrc=["\"](https://(?:www\.)?mixcloud\.com/media/js/www\.[^>]+\.js)',
webpage, 'js url')
js = self._download_webpage(js_url, track_id)
# Known plaintext attack
if encrypted_play_info:
kps = ['{"stream_url":']
kpa_target = encrypted_play_info
else:
kps = ['https://', 'http://']
kpa_target = base64.b64decode(info_json['streamInfo']['url'])
for kp in kps:
partial_key = self._decrypt_xor_cipher(kpa_target, kp)
for quote in ["'", '"']:
key = self._search_regex(r'{0}({1}[^{0}]*){0}'.format(quote, re.escape(partial_key)), js,
"encryption key", default=None)
if key is not None:
break
else:
continue
break
else:
raise ExtractorError('Failed to extract encryption key')

if encrypted_play_info is not None:
play_info = self._parse_json(self._decrypt_xor_cipher(key, encrypted_play_info), 'play info')
if message and 'stream_url' not in play_info:
raise ExtractorError('%s said: %s' % (self.IE_NAME, message), expected=True)
song_url = play_info['stream_url']
formats = [{
'format_id': 'normal',
'url': song_url
}]

title = self._html_search_regex(r'm-title="([^"]+)"', webpage, 'title')
thumbnail = self._proto_relative_url(self._html_search_regex(
r'm-thumbnail-url="([^"]+)"', webpage, 'thumbnail', fatal=False))
uploader = self._html_search_regex(
r'm-owner-name="([^"]+)"', webpage, 'uploader', fatal=False)
uploader_id = self._search_regex(
r'\s+"profile": "([^"]+)",', webpage, 'uploader id', fatal=False)
description = self._og_search_description(webpage)
view_count = str_to_int(self._search_regex(
[r'<meta itemprop="interactionCount" content="UserPlays:([0-9]+)"',
r'/listeners/?">([0-9,.]+)</a>',
r'(?:m|data)-tooltip=["\']([\d,.]+) plays'],
webpage, 'play count', default=None))

else:
title = info_json['name']
thumbnail = try_get(info_json,
lambda x: 'https://thumbnailer.mixcloud.com/unsafe/600x600/' + x['picture']['urlRoot'])
uploader = try_get(info_json, lambda x: x['owner']['displayName'])
uploader_id = try_get(info_json, lambda x: x['owner']['username'])
description = try_get(info_json, lambda x: x['description'])
view_count = try_get(info_json, lambda x: x['plays'])

stream_info = info_json['streamInfo']
formats = []
self._decrypt_and_extend(stream_info, 'url', lambda x: [{
'format_id': 'normal',
'url': x
}], key, formats)
self._decrypt_and_extend(stream_info, 'hlsUrl', lambda x: self._extract_m3u8_formats(x, title), key,
formats)
self._decrypt_and_extend(stream_info, 'dashUrl', lambda x: self._extract_mpd_formats(x, title), key,
formats)

return {
'id': track_id,
'title': title,
'url': song_url,
'formats': formats,
'description': description,
'thumbnail': thumbnail,
'uploader': uploader,
Expand Down