Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[tiktok] fix extraction #22838

Closed
wants to merge 12 commits into from
5 changes: 1 addition & 4 deletions youtube_dl/extractor/extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -1127,10 +1127,7 @@
from .thisav import ThisAVIE
from .thisoldhouse import ThisOldHouseIE
from .threeqsdn import ThreeQSDNIE
from .tiktok import (
TikTokIE,
TikTokUserIE,
)
from .tiktok import TikTokIE
from .tinypic import TinyPicIE
from .tmz import (
TMZIE,
Expand Down
207 changes: 104 additions & 103 deletions youtube_dl/extractor/tiktok.py
Original file line number Diff line number Diff line change
@@ -1,138 +1,139 @@
# coding: utf-8
from __future__ import unicode_literals
from datetime import datetime

from .common import InfoExtractor
from ..utils import (
compat_str,
ExtractorError,
int_or_none,
str_or_none,
try_get,
url_or_none,
try_get
)


class TikTokBaseIE(InfoExtractor):
def _extract_aweme(self, data):
video = data['video']
description = str_or_none(try_get(data, lambda x: x['desc']))
width = int_or_none(try_get(data, lambda x: video['width']))
height = int_or_none(try_get(data, lambda x: video['height']))
def _extract_aweme(self, video_data, webpage):
video_info = try_get(
video_data, lambda x: x['videoData']['itemInfos'], dict)
author_info = try_get(
video_data, lambda x: x['videoData']['authorInfos'], dict)
share_info = try_get(video_data, lambda x: x['shareMeta'], dict)

format_urls = set()
formats = []
for format_id in (
'play_addr_lowbr', 'play_addr', 'play_addr_h264',
'download_addr'):
for format in try_get(
video, lambda x: x[format_id]['url_list'], list) or []:
format_url = url_or_none(format)
if not format_url:
continue
if format_url in format_urls:
continue
format_urls.add(format_url)
formats.append({
'url': format_url,
'ext': 'mp4',
'height': height,
'width': width,
})
self._sort_formats(formats)
unique_id = str_or_none(author_info.get('uniqueId'))
timestamp = try_get(video_info, lambda x: int(x['createTime']), int)
date = datetime.fromtimestamp(timestamp).strftime('%Y%m%d')

thumbnail = url_or_none(try_get(
video, lambda x: x['cover']['url_list'][0], compat_str))
uploader = try_get(data, lambda x: x['author']['nickname'], compat_str)
timestamp = int_or_none(data.get('create_time'))
comment_count = int_or_none(data.get('comment_count')) or int_or_none(
try_get(data, lambda x: x['statistics']['comment_count']))
repost_count = int_or_none(try_get(
data, lambda x: x['statistics']['share_count']))
height = try_get(video_info, lambda x: x['video']['videoMeta']['height'], int)
width = try_get(video_info, lambda x: x['video']['videoMeta']['width'], int)
thumbnails = []
thumbnails.append({
'url': video_info.get('thumbnail') or self._og_search_thumbnail(webpage),
'width': width,
'height': height
})

aweme_id = data['aweme_id']
formats = []
formats.append({
'url': try_get(video_info, lambda x: x['video']['urls'][0]),
'ext': 'mp4',
'height': height,
'width': width,
'http_headers': {
'Referer': self._og_search_url(webpage),
},
})

return {
'id': aweme_id,
'title': uploader or aweme_id,
'description': description,
'thumbnail': thumbnail,
'uploader': uploader,
'comment_count': int_or_none(video_info.get('commentCount')),
'duration': try_get(video_info, lambda x: x['video']['videoMeta']['duration'], int),
'height': height,
'id': str_or_none(video_info.get('id')),
'like_count': int_or_none(video_info.get('diggCount')),
'repost_count': int_or_none(video_info.get('shareCount')),
'thumbnail': try_get(video_info, lambda x: x['covers'][0]),
'timestamp': timestamp,
'comment_count': comment_count,
'repost_count': repost_count,
'formats': formats,
'width': width,
'title': str_or_none(share_info.get('title')) or self._og_search_title(webpage),
'creator': str_or_none(author_info.get('nickName')),
'uploader': unique_id,
'uploader_id': str_or_none(author_info.get('userId')),
'uploader_url': 'https://www.tiktok.com/@' + unique_id,
'thumbnails': thumbnails,
'upload_date': date,
skyme5 marked this conversation as resolved.
Show resolved Hide resolved
'webpage_url': self._og_search_url(webpage),
'description': str_or_none(video_info.get('text')) or str_or_none(share_info.get('desc')),
'ext': 'mp4',
'formats': formats
}


class TikTokIE(TikTokBaseIE):
_VALID_URL = r'''(?x)
https?://
(?:
(?:m\.)?tiktok\.com/v|
(?:www\.)?tiktok\.com/share/video
)
/(?P<id>\d+)
'''
_VALID_URL = r'https?://www\.tiktok\.com/@[\w\._]+/video/(?P<id>\d+)'

_TESTS = [{
'url': 'https://m.tiktok.com/v/6606727368545406213.html',
'md5': 'd584b572e92fcd48888051f238022420',
'url': 'https://www.tiktok.com/@leenabhushan/video/6748451240264420610',
'md5': '34a7543afd5a151b0840ba6736fb633b',
'info_dict': {
'id': '6606727368545406213',
'ext': 'mp4',
'title': 'Zureeal',
'description': '#bowsette#mario#cosplay#uk#lgbt#gaming#asian#bowsettecosplay',
'thumbnail': r're:^https?://.*~noop.image',
'uploader': 'Zureeal',
'timestamp': 1538248586,
'upload_date': '20180929',
'comment_count': int,
'creator': 'facestoriesbyleenabh',
'description': 'md5:a9f6c0c44a1ff2249cae610372d0ae95',
'duration': 13,
'ext': 'mp4',
'formats': list,
'height': 1280,
'id': '6748451240264420610',
'like_count': int,
'repost_count': int,
'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?',
'thumbnails': list,
'timestamp': 1571246252,
'title': 'facestoriesbyleenabh on TikTok',
'upload_date': '20191016',
'uploader': 'leenabhushan',
'uploader_id': '6691488002098119685',
'uploader_url': r're:https://www.tiktok.com/@leenabhushan',
'webpage_url': r're:https://www.tiktok.com/@leenabhushan/(video/)?6748451240264420610',
'width': 720,
}
}, {
'url': 'https://www.tiktok.com/share/video/6606727368545406213',
'only_matching': True,
'url': 'https://www.tiktok.com/@patroxofficial/video/6742501081818877190?langCountry=en',
skyme5 marked this conversation as resolved.
Show resolved Hide resolved
'md5': '06b9800d47d5fe51a19e322dd86e61c9',
'info_dict': {
'comment_count': int,
'creator': 'patroX',
'description': 'md5:5e2a23877420bb85ce6521dbee39ba94',
'duration': 27,
'ext': 'mp4',
'formats': list,
'height': 960,
'id': '6742501081818877190',
'like_count': int,
'repost_count': int,
'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?',
'thumbnails': list,
'timestamp': 1569860870,
'title': 'patroX on TikTok',
'upload_date': '20190930',
'uploader': 'patroxofficial',
'uploader_id': '18702747',
'uploader_url': r're:https://www.tiktok.com/@patroxofficial',
'webpage_url': r're:https://www.tiktok.com/@patroxofficial/(video/)?6742501081818877190',
'width': 540,
}
}]

def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(
'https://m.tiktok.com/v/%s.html' % video_id, video_id)
data = self._parse_json(self._search_regex(
r'\bdata\s*=\s*({.+?})\s*;', webpage, 'data'), video_id)
return self._extract_aweme(data)

webpage = self._download_webpage(url, video_id, note='Downloading video webpage')
json_string = self._search_regex(
r'id=\"__NEXT_DATA__\"\s+type=\"application\/json\"\s*[^>]+>\s*(?P<json_string_ld>[^<]+)',
webpage, 'json_string', group='json_string_ld')
json_data = self._parse_json(json_string, video_id)
video_data = try_get(json_data, lambda x: x['props']['pageProps'], expected_type=dict)

class TikTokUserIE(TikTokBaseIE):
_VALID_URL = r'''(?x)
https?://
(?:
(?:m\.)?tiktok\.com/h5/share/usr|
(?:www\.)?tiktok\.com/share/user
)
/(?P<id>\d+)
'''
_TESTS = [{
'url': 'https://m.tiktok.com/h5/share/usr/188294915489964032.html',
'info_dict': {
'id': '188294915489964032',
},
'playlist_mincount': 24,
}, {
'url': 'https://www.tiktok.com/share/user/188294915489964032',
'only_matching': True,
}]
# Check statusCode for success
if video_data.get('statusCode') == 0:
return self._extract_aweme(video_data, webpage)

def _real_extract(self, url):
user_id = self._match_id(url)
data = self._download_json(
'https://m.tiktok.com/h5/share/usr/list/%s/' % user_id, user_id,
query={'_signature': '_'})
entries = []
for aweme in data['aweme_list']:
try:
entry = self._extract_aweme(aweme)
except ExtractorError:
continue
entry['extractor_key'] = TikTokIE.ie_key()
entries.append(entry)
return self.playlist_result(entries, user_id)
raise ExtractorError('Video not available', video_id=video_id)