Added --continue-put and --upload-id, allowing skipping of files or
part files during upload when their size and md5sum match the file
being uploaded.
Eugene Brevdo committed Jul 15, 2013
1 parent 4d7039f commit dc071cc
Showing 5 changed files with 155 additions and 26 deletions.
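The core of the change is a simple skip rule: a file or part that is already on S3 is left alone only when both its remote size and its remote MD5 (the ETag) match the local data; otherwise it is re-uploaded. A minimal standalone sketch of that decision, assuming nothing from s3cmd itself (the should_skip name and the sample values are illustrative):

import hashlib

def should_skip(local_data, remote_size, remote_etag):
    # Skip only when both size and md5sum agree with the remote copy.
    if remote_size != len(local_data):
        return False
    return remote_etag.strip('"') == hashlib.md5(local_data).hexdigest()

# md5("hello") is 5d41402abc4b2a76b9719d911017c592, so this part would be skipped:
print(should_skip(b"hello", 5, '"5d41402abc4b2a76b9719d911017c592"'))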
4 changes: 2 additions & 2 deletions S3/Config.py
@@ -35,6 +35,8 @@ class Config(object):
force = False
enable = None
get_continue = False
put_continue = False
upload_id = None
skip_existing = False
recursive = False
acl_public = None
@@ -172,8 +174,6 @@ def env_config(self):
else:
print_value = data["value"]
debug("env_Config: %s->%s" % (data["key"], print_value))



def option_list(self):
retval = []
100 changes: 86 additions & 14 deletions S3/MultiPart.py
@@ -3,10 +3,12 @@
## License: GPL Version 2

import os
import sys
from stat import ST_SIZE
from logging import debug, info, warning, error
from Utils import getTextFromXml, formatSize, unicodise
from Utils import getTextFromXml, getTreeFromXml, formatSize, unicodise, calculateChecksum, parseNodes
from Exceptions import S3UploadError
from collections import defaultdict

class MultiPartUpload(object):

@@ -22,15 +24,55 @@ def __init__(self, s3, file, uri, headers_baseline = {}):
self.headers_baseline = headers_baseline
self.upload_id = self.initiate_multipart_upload()

def get_parts_information(self, uri, upload_id):
multipart_response = self.s3.list_multipart(uri, upload_id)
tree = getTreeFromXml(multipart_response['data'])

parts = defaultdict(lambda: None)
for elem in parseNodes(tree):
try:
parts[int(elem['PartNumber'])] = {'checksum': elem['ETag'], 'size': elem['Size']}
except KeyError:
pass

return parts

def get_unique_upload_id(self, uri):
upload_id = None
multipart_response = self.s3.get_multipart(uri)
tree = getTreeFromXml(multipart_response['data'])
for mpupload in parseNodes(tree):
try:
mp_upload_id = mpupload['UploadId']
mp_path = mpupload['Key']
info("mp_path: %s, object: %s" % (mp_path, uri.object()))
if mp_path == uri.object():
if upload_id is not None:
raise ValueError("More than one UploadId for URI %s. Disable multipart upload, or use\n %s multipart %s\nto list the Ids, then pass a unique --upload-id into the put command." % (uri, sys.argv[0], uri))
upload_id = mp_upload_id
except KeyError:
pass

return upload_id

def initiate_multipart_upload(self):
"""
Begin a multipart upload
http://docs.amazonwebservices.com/AmazonS3/latest/API/index.html?mpUploadInitiate.html
"""
request = self.s3.create_request("OBJECT_POST", uri = self.uri, headers = self.headers_baseline, extra = "?uploads")
response = self.s3.send_request(request)
data = response["data"]
self.upload_id = getTextFromXml(data, "UploadId")
if self.s3.config.upload_id is not None:
self.upload_id = self.s3.config.upload_id
elif self.s3.config.put_continue:
self.upload_id = self.get_unique_upload_id(self.uri)
else:
self.upload_id = None

if self.upload_id is None:
request = self.s3.create_request("OBJECT_POST", uri = self.uri, headers = self.headers_baseline, extra = "?uploads")
response = self.s3.send_request(request)
data = response["data"]
self.upload_id = getTextFromXml(data, "UploadId")

return self.upload_id

def upload_all_parts(self):
@@ -51,6 +93,10 @@ def upload_all_parts(self):
else:
debug("MultiPart: Uploading from %s" % (self.file.name))

remote_statuses = defaultdict(lambda: None)
if self.s3.config.put_continue:
remote_statuses = self.get_parts_information(self.uri, self.upload_id)

seq = 1
if self.file.name != "<stdin>":
while size_left > 0:
@@ -63,10 +109,10 @@ def upload_all_parts(self):
'extra' : "[part %d of %d, %s]" % (seq, nr_parts, "%d%sB" % formatSize(current_chunk_size, human_readable = True))
}
try:
self.upload_part(seq, offset, current_chunk_size, labels)
self.upload_part(seq, offset, current_chunk_size, labels, remote_status = remote_statuses[seq])
except:
error(u"Upload of '%s' part %d failed. Aborting multipart upload." % (self.file.name, seq))
self.abort_upload()
error(u"\nUpload of '%s' part %d failed. Use\n %s abortmp %s %s\nto abort the upload, or\n %s --upload-id %s put ...\nto continue the upload."
% (self.file.name, seq, sys.argv[0], self.uri, self.upload_id, sys.argv[0], self.upload_id))
raise
seq += 1
else:
@@ -82,22 +128,37 @@ def upload_all_parts(self):
if len(buffer) == 0: # EOF
break
try:
self.upload_part(seq, offset, current_chunk_size, labels, buffer)
self.upload_part(seq, offset, current_chunk_size, labels, buffer, remote_status = remote_statuses[seq])
except:
error(u"Upload of '%s' part %d failed. Aborting multipart upload." % (self.file.name, seq))
self.abort_upload()
error(u"\nUpload of '%s' part %d failed. Use\n %s abortmp %s %s\nto abort, or\n %s --upload-id %s put ...\nto continue the upload."
% (self.file.name, seq, sys.argv[0], self.uri, self.upload_id, sys.argv[0], self.upload_id))
raise
seq += 1

debug("MultiPart: Upload finished: %d parts", seq - 1)

def upload_part(self, seq, offset, chunk_size, labels, buffer = ''):
def upload_part(self, seq, offset, chunk_size, labels, buffer = '', remote_status = None):
"""
Upload a file chunk
http://docs.amazonwebservices.com/AmazonS3/latest/API/index.html?mpUploadUploadPart.html
"""
# TODO implement Content-MD5
debug("Uploading part %i of %r (%s bytes)" % (seq, self.upload_id, chunk_size))

if remote_status is not None:
if int(remote_status['size']) == chunk_size:
checksum = calculateChecksum(buffer, self.file, offset, chunk_size, self.s3.config.send_chunk)
remote_checksum = remote_status['checksum'].strip('"')
if remote_checksum == checksum:
warning("MultiPart: size and md5sum match for %s part %d, skipping." % (self.uri, seq))
return
else:
warning("MultiPart: checksum (%s vs %s) does not match for %s part %d, reuploading."
% (remote_checksum, checksum, self.uri, seq))
else:
warning("MultiPart: size (%d vs %d) does not match for %s part %d, reuploading."
% (int(remote_status['size']), chunk_size, self.uri, seq))

headers = { "content-length": chunk_size }
query_string = "?partNumber=%i&uploadId=%s" % (seq, self.upload_id)
request = self.s3.create_request("OBJECT_PUT", uri = self.uri, headers = headers, extra = query_string)
@@ -130,8 +191,19 @@ def abort_upload(self):
http://docs.amazonwebservices.com/AmazonS3/latest/API/index.html?mpUploadAbort.html
"""
debug("MultiPart: Aborting upload: %s" % self.upload_id)
request = self.s3.create_request("OBJECT_DELETE", uri = self.uri, extra = "?uploadId=%s" % (self.upload_id))
response = self.s3.send_request(request)
#request = self.s3.create_request("OBJECT_DELETE", uri = self.uri, extra = "?uploadId=%s" % (self.upload_id))
#response = self.s3.send_request(request)
response = None
return response

# vim:et:ts=4:sts=4:ai
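get_unique_upload_id above accepts an UploadId only when exactly one in-progress multipart upload matches the target key, and raises otherwise so the user can pick one with --upload-id. A rough standalone equivalent of that selection, using xml.etree in place of s3cmd's getTreeFromXml/parseNodes helpers (the sample XML is illustrative; real ListMultipartUploads responses also carry an XML namespace):

import xml.etree.ElementTree as ET

SAMPLE = """<ListMultipartUploadsResult>
  <Upload><Key>big.bin</Key><UploadId>abc123</UploadId></Upload>
  <Upload><Key>other.bin</Key><UploadId>def456</UploadId></Upload>
</ListMultipartUploadsResult>"""

def unique_upload_id(xml_text, key):
    upload_id = None
    for upload in ET.fromstring(xml_text).findall("Upload"):
        if upload.findtext("Key") != key:
            continue
        if upload_id is not None:
            # Ambiguous: same behaviour as the ValueError raised above.
            raise ValueError("More than one UploadId for key %s" % key)
        upload_id = upload.findtext("UploadId")
    return upload_id

print(unique_upload_id(SAMPLE, "big.bin"))   # abc123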










28 changes: 28 additions & 0 deletions S3/S3.py
@@ -439,6 +439,31 @@ def object_put(self, filename, uri, extra_headers = None, extra_label = ""):
return self.send_file_multipart(file, headers, uri, size)

## Not multipart...
if self.config.put_continue:
# Note, if input was stdin, we would be performing multipart upload.
# So this will always work as long as the file already uploaded was
# not uploaded via MultiUpload, in which case its ETag will not be
# an md5.
try:
info = self.object_info(uri)
except:
info = None

if info is not None:
remote_size = int(info['headers']['content-length'])
remote_checksum = info['headers']['etag'].strip('"')
if size == remote_size:
checksum = calculateChecksum('', file, 0, size, self.config.send_chunk)
if remote_checksum == checksum:
warning("Put: size and md5sum match for %s, skipping." % uri)
return
else:
warning("Put: checksum (%s vs %s) does not match for %s, reuploading."
% (remote_checksum, checksum, uri))
else:
warning("Put: size (%d vs %d) does not match for %s, reuploading."
% (remote_size, size, uri))

headers["content-length"] = size
request = self.create_request("OBJECT_PUT", uri = uri, headers = headers)
labels = { 'source' : unicodise(filename), 'destination' : unicodise(uri.uri()), 'extra' : extra_label }
@@ -754,13 +779,15 @@ def send_file(self, request, file, labels, buffer = '', throttle = 0, retries =
if buffer == '':
file.seek(offset)
md5_hash = md5()

try:
while (size_left > 0):
#debug("SendFile: Reading up to %d bytes from '%s' - remaining bytes: %s" % (self.config.send_chunk, file.name, size_left))
if buffer == '':
data = file.read(min(self.config.send_chunk, size_left))
else:
data = buffer

md5_hash.update(data)
conn.c.send(data)
if self.config.progress_meter:
@@ -769,6 +796,7 @@ def send_file(self, request, file, labels, buffer = '', throttle = 0, retries =
if throttle:
time.sleep(throttle)
md5_computed = md5_hash.hexdigest()

response = {}
http_response = conn.c.getresponse()
response["status"] = http_response.status
18 changes: 18 additions & 0 deletions S3/Utils.py
@@ -459,4 +459,22 @@ def getHostnameFromBucket(bucket):
return Config.Config().host_bucket % { 'bucket' : bucket }
__all__.append("getHostnameFromBucket")


def calculateChecksum(buffer, mfile, offset, chunk_size, send_chunk):
md5_hash = md5()
size_left = chunk_size
if buffer == '':
mfile.seek(offset)
while size_left > 0:
data = mfile.read(min(send_chunk, size_left))
md5_hash.update(data)
size_left -= len(data)
else:
md5_hash.update(buffer)

return md5_hash.hexdigest()


__all__.append("calculateChecksum")

# vim:et:ts=4:sts=4:ai
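A quick illustration of how calculateChecksum behaves, assuming the S3 package is importable: with an empty buffer it hashes chunk_size bytes of the file starting at offset, reading send_chunk bytes at a time; with a non-empty buffer it hashes the buffer itself and ignores the file. The temporary file below is just test scaffolding:

import hashlib, tempfile
from S3.Utils import calculateChecksum   # assumes s3cmd's S3 package is on the path

data = b"0123456789" * 1000
with tempfile.NamedTemporaryFile(delete=False) as f:
    f.write(data)
    path = f.name

offset, chunk_size, send_chunk = 100, 4096, 512
expected = hashlib.md5(data[offset:offset + chunk_size]).hexdigest()

with open(path, "rb") as mfile:
    # Empty buffer: hash the slice [offset, offset + chunk_size) of the file.
    assert calculateChecksum('', mfile, offset, chunk_size, send_chunk) == expected

# Non-empty buffer: the file and offset are ignored, the buffer is hashed directly.
assert calculateChecksum(data[offset:offset + chunk_size], None, 0, chunk_size, send_chunk) == expected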
31 changes: 21 additions & 10 deletions s3cmd
@@ -325,14 +325,15 @@ def cmd_object_put(args):
except InvalidFileError, e:
warning(u"File can not be uploaded: %s" % e)
continue
speed_fmt = formatSize(response["speed"], human_readable = True, floating_point = True)
if not Config().progress_meter:
output(u"File '%s' stored as '%s' (%d bytes in %0.1f seconds, %0.2f %sB/s) %s" %
(unicodise(full_name_orig), uri_final, response["size"], response["elapsed"],
speed_fmt[0], speed_fmt[1], seq_label))
if response is not None:
speed_fmt = formatSize(response["speed"], human_readable = True, floating_point = True)
if not Config().progress_meter:
output(u"File '%s' stored as '%s' (%d bytes in %0.1f seconds, %0.2f %sB/s) %s" %
(unicodise(full_name_orig), uri_final, response["size"], response["elapsed"],
speed_fmt[0], speed_fmt[1], seq_label))
if Config().acl_public:
output(u"Public URL of the object is: %s" %
(uri_final.public_url()))
(uri_final.public_url()))
if Config().encrypt and full_name != full_name_orig:
debug(u"Removing temporary encrypted file: %s" % unicodise(full_name))
os.remove(full_name)
@@ -1268,7 +1269,7 @@ def cmd_delpolicy(args):
def cmd_multipart(args):
s3 = S3(cfg)
uri = S3Uri(args[0])

#id = ''
#if(len(args) > 1): id = args[1]

@@ -1277,11 +1278,11 @@
output(u"%s" % uri)
tree = getTreeFromXml(response['data'])
debug(parseNodes(tree))
output(u"Initiated\tId\tPath")
output(u"Initiated\tPath\tId")
for mpupload in parseNodes(tree):
try:
output("%s\t%s\t%s" % (mpupload['Initiated'], mpupload['UploadId'], mpupload['Key']))
except:
output("%s\t%s\t%s" % (mpupload['Initiated'], "s3://" + uri.bucket() + "/" + mpupload['Key'], mpupload['UploadId']))
except KeyError:
pass

def cmd_abort_multipart(args):
@@ -1808,6 +1809,8 @@ def main():
optparser.add_option( "--no-encrypt", dest="encrypt", action="store_false", help="Don't encrypt files.")
optparser.add_option("-f", "--force", dest="force", action="store_true", help="Force overwrite and other dangerous operations.")
optparser.add_option( "--continue", dest="get_continue", action="store_true", help="Continue getting a partially downloaded file (only for [get] command).")
optparser.add_option( "--continue-put", dest="put_continue", action="store_true", help="Continue uploading partially uploaded files or multipart upload parts. Restarts files/parts that don't have matching size and md5sum. Skips files/parts that do. Note: md5sum checks are not always sufficient to verify (part) file equality. Enable this at your own risk.")
optparser.add_option( "--upload-id", dest="upload_id", help="UploadId for Multipart Upload, in case you want to continue an existing upload (equivalent to --continue-put) and there are multiple partial uploads. Use s3cmd multipart [URI] to see what UploadIds are associated with the given URI.")
optparser.add_option( "--skip-existing", dest="skip_existing", action="store_true", help="Skip over files that exist at the destination (only for [get] and [sync] commands).")
optparser.add_option("-r", "--recursive", dest="recursive", action="store_true", help="Recursive upload, download or removal.")
optparser.add_option( "--check-md5", dest="check_md5", action="store_true", help="Check MD5 sums when comparing files for [sync]. (default)")
@@ -1997,6 +2000,14 @@ def main():
if cfg.multipart_chunk_size_mb > MultiPartUpload.MAX_CHUNK_SIZE_MB:
raise ParameterError("Chunk size %d MB is too large, must be <= %d MB. Please adjust --multipart-chunk-size-mb" % (cfg.multipart_chunk_size_mb, MultiPartUpload.MAX_CHUNK_SIZE_MB))

## If an UploadId was provided, set put_continue True
if options.upload_id is not None:
cfg.upload_id = options.upload_id
cfg.put_continue = True

if cfg.put_continue and not cfg.multipart_chunk_size_mb:
raise ParameterError("Must have --multipart-chunk-size-mb if using --continue-put or --upload-id")

## CloudFront's cf_enable and Config's enable share the same --enable switch
options.cf_enable = options.enable

