Added --continue-put and --upload-id, allowing skipping of files or
part files during upload when their size and md5sum match the file
being uploaded.
Eugene Brevdo committed Jul 15, 2013
1 parent 4d7039f commit dc071cc
Showing 5 changed files with 155 additions and 26 deletions.
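The core of the change is a simple skip rule: a file or part that is already on S3 is left alone only when both its remote size and its remote MD5 (the ETag) match the local data; otherwise it is re-uploaded. A minimal standalone sketch of that decision, assuming nothing from s3cmd itself (the should_skip name and the sample values are illustrative):

import hashlib

def should_skip(local_data, remote_size, remote_etag):
    # Skip only when both size and md5sum agree with the remote copy.
    if remote_size != len(local_data):
        return False
    return remote_etag.strip('"') == hashlib.md5(local_data).hexdigest()

# md5("hello") is 5d41402abc4b2a76b9719d911017c592, so this part would be skipped:
print(should_skip(b"hello", 5, '"5d41402abc4b2a76b9719d911017c592"'))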
4 changes: 2 additions & 2 deletions S3/Config.py
@@ -35,6 +35,8 @@ class Config(object):
force = False
enable = None
get_continue = False
put_continue = False
upload_id = None
skip_existing = False
recursive = False
acl_public = None
@@ -172,8 +174,6 @@ def env_config(self):
else:
print_value = data["value"]
debug("env_Config: %s->%s" % (data["key"], print_value))



def option_list(self):
retval = []
100 changes: 86 additions & 14 deletions S3/MultiPart.py
@@ -3,10 +3,12 @@
## License: GPL Version 2

import os
import sys
from stat import ST_SIZE
from logging import debug, info, warning, error
from Utils import getTextFromXml, formatSize, unicodise
from Utils import getTextFromXml, getTreeFromXml, formatSize, unicodise, calculateChecksum, parseNodes
from Exceptions import S3UploadError
from collections import defaultdict

class MultiPartUpload(object):

@@ -22,15 +24,55 @@ def __init__(self, s3, file, uri, headers_baseline = {}):
self.headers_baseline = headers_baseline
self.upload_id = self.initiate_multipart_upload()

def get_parts_information(self, uri, upload_id):
multipart_response = self.s3.list_multipart(uri, upload_id)
tree = getTreeFromXml(multipart_response['data'])

parts = defaultdict(lambda: None)
for elem in parseNodes(tree):
try:
parts[int(elem['PartNumber'])] = {'checksum': elem['ETag'], 'size': elem['Size']}
except KeyError:
pass

return parts

def get_unique_upload_id(self, uri):
upload_id = None
multipart_response = self.s3.get_multipart(uri)
tree = getTreeFromXml(multipart_response['data'])
for mpupload in parseNodes(tree):
try:
mp_upload_id = mpupload['UploadId']
mp_path = mpupload['Key']
info("mp_path: %s, object: %s" % (mp_path, uri.object()))
if mp_path == uri.object():
if upload_id is not None:
raise ValueError("More than one UploadId for URI %s. Disable multipart upload, or use\n %s multipart %s\nto list the Ids, then pass a unique --upload-id into the put command." % (uri, sys.argv[0], uri))
upload_id = mp_upload_id
except KeyError:
pass

return upload_id

def initiate_multipart_upload(self):
"""
Begin a multipart upload
http://docs.amazonwebservices.com/AmazonS3/latest/API/index.html?mpUploadInitiate.html
"""
request = self.s3.create_request("OBJECT_POST", uri = self.uri, headers = self.headers_baseline, extra = "?uploads")
response = self.s3.send_request(request)
data = response["data"]
self.upload_id = getTextFromXml(data, "UploadId")
if self.s3.config.upload_id is not None:
self.upload_id = self.s3.config.upload_id
elif self.s3.config.put_continue:
self.upload_id = self.get_unique_upload_id(self.uri)
else:
self.upload_id = None

if self.upload_id is None:
request = self.s3.create_request("OBJECT_POST", uri = self.uri, headers = self.headers_baseline, extra = "?uploads")
response = self.s3.send_request(request)
data = response["data"]
self.upload_id = getTextFromXml(data, "UploadId")

return self.upload_id

def upload_all_parts(self):
@@ -51,6 +93,10 @@ def upload_all_parts(self):
else:
debug("MultiPart: Uploading from %s" % (self.file.name))

remote_statuses = defaultdict(lambda: None)
if self.s3.config.put_continue:
remote_statuses = self.get_parts_information(self.uri, self.upload_id)

seq = 1
if self.file.name != "<stdin>":
while size_left > 0:
@@ -63,10 +109,10 @@ def upload_all_parts(self):
'extra' : "[part %d of %d, %s]" % (seq, nr_parts, "%d%sB" % formatSize(current_chunk_size, human_readable = True))
}
try:
self.upload_part(seq, offset, current_chunk_size, labels)
self.upload_part(seq, offset, current_chunk_size, labels, remote_status = remote_statuses[seq])
except:
error(u"Upload of '%s' part %d failed. Aborting multipart upload." % (self.file.name, seq))
self.abort_upload()
error(u"\nUpload of '%s' part %d failed. Use\n %s abortmp %s %s\nto abort the upload, or\n %s --upload-id %s put ...\nto continue the upload."
% (self.file.name, seq, sys.argv[0], self.uri, self.upload_id, sys.argv[0], self.upload_id))
raise
seq += 1
else:
@@ -82,22 +128,37 @@ def upload_all_parts(self):
if len(buffer) == 0: # EOF
break
try:
self.upload_part(seq, offset, current_chunk_size, labels, buffer)
self.upload_part(seq, offset, current_chunk_size, labels, buffer, remote_status = remote_statuses[seq])
except:
error(u"Upload of '%s' part %d failed. Aborting multipart upload." % (self.file.name, seq))
self.abort_upload()
error(u"\nUpload of '%s' part %d failed. Use\n %s abortmp %s %s\nto abort, or\n %s --upload-id %s put ...\nto continue the upload."
% (self.file.name, seq, sys.argv[0], self.uri, self.upload_id, sys.argv[0], self.upload_id))
raise
seq += 1

debug("MultiPart: Upload finished: %d parts", seq - 1)

def upload_part(self, seq, offset, chunk_size, labels, buffer = ''):
def upload_part(self, seq, offset, chunk_size, labels, buffer = '', remote_status = None):
"""
Upload a file chunk
http://docs.amazonwebservices.com/AmazonS3/latest/API/index.html?mpUploadUploadPart.html
"""
# TODO implement Content-MD5
debug("Uploading part %i of %r (%s bytes)" % (seq, self.upload_id, chunk_size))

if remote_status is not None:
if int(remote_status['size']) == chunk_size:
checksum = calculateChecksum(buffer, self.file, offset, chunk_size, self.s3.config.send_chunk)
remote_checksum = remote_status['checksum'].strip('"')
if remote_checksum == checksum:
warning("MultiPart: size and md5sum match for %s part %d, skipping." % (self.uri, seq))
return
else:
warning("MultiPart: checksum (%s vs %s) does not match for %s part %d, reuploading."
% (remote_checksum, checksum, self.uri, seq))
else:
warning("MultiPart: size (%d vs %d) does not match for %s part %d, reuploading."
% (int(remote_status['size']), chunk_size, self.uri, seq))

headers = { "content-length": chunk_size }
query_string = "?partNumber=%i&uploadId=%s" % (seq, self.upload_id)
request = self.s3.create_request("OBJECT_PUT", uri = self.uri, headers = headers, extra = query_string)
@@ -130,8 +191,19 @@ def abort_upload(self):
http://docs.amazonwebservices.com/AmazonS3/latest/API/index.html?mpUploadAbort.html
"""
debug("MultiPart: Aborting upload: %s" % self.upload_id)
request = self.s3.create_request("OBJECT_DELETE", uri = self.uri, extra = "?uploadId=%s" % (self.upload_id))
response = self.s3.send_request(request)
#request = self.s3.create_request("OBJECT_DELETE", uri = self.uri, extra = "?uploadId=%s" % (self.upload_id))
#response = self.s3.send_request(request)
response = None
return response

# vim:et:ts=4:sts=4:ai
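get_unique_upload_id above accepts an UploadId only when exactly one in-progress multipart upload matches the target key, and raises otherwise so the user can pick one with --upload-id. A rough standalone equivalent of that selection, using xml.etree in place of s3cmd's getTreeFromXml/parseNodes helpers (the sample XML is illustrative; real ListMultipartUploads responses also carry an XML namespace):

import xml.etree.ElementTree as ET

SAMPLE = """<ListMultipartUploadsResult>
  <Upload><Key>big.bin</Key><UploadId>abc123</UploadId></Upload>
  <Upload><Key>other.bin</Key><UploadId>def456</UploadId></Upload>
</ListMultipartUploadsResult>"""

def unique_upload_id(xml_text, key):
    upload_id = None
    for upload in ET.fromstring(xml_text).findall("Upload"):
        if upload.findtext("Key") != key:
            continue
        if upload_id is not None:
            # Ambiguous: same behaviour as the ValueError raised above.
            raise ValueError("More than one UploadId for key %s" % key)
        upload_id = upload.findtext("UploadId")
    return upload_id

print(unique_upload_id(SAMPLE, "big.bin"))   # abc123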










28 changes: 28 additions & 0 deletions S3/S3.py
@@ -439,6 +439,31 @@ def object_put(self, filename, uri, extra_headers = None, extra_label = ""):
return self.send_file_multipart(file, headers, uri, size)

## Not multipart...
if self.config.put_continue:
# Note, if input was stdin, we would be performing multipart upload.
# So this will always work as long as the file already uploaded was
# not uploaded via MultiUpload, in which case its ETag will not be
# an md5.
try:
info = self.object_info(uri)
except:
info = None

if info is not None:
remote_size = int(info['headers']['content-length'])
remote_checksum = info['headers']['etag'].strip('"')
if size == remote_size:
checksum = calculateChecksum('', file, 0, size, self.config.send_chunk)
if remote_checksum == checksum:
warning("Put: size and md5sum match for %s, skipping." % uri)
return
else:
warning("Put: checksum (%s vs %s) does not match for %s, reuploading."
% (remote_checksum, checksum, uri))
else:
warning("Put: size (%d vs %d) does not match for %s, reuploading."
% (remote_size, size, uri))

headers["content-length"] = size
request = self.create_request("OBJECT_PUT", uri = uri, headers = headers)
labels = { 'source' : unicodise(filename), 'destination' : unicodise(uri.uri()), 'extra' : extra_label }
@@ -754,13 +779,15 @@ def send_file(self, request, file, labels, buffer = '', throttle = 0, retries =
if buffer == '':
file.seek(offset)
md5_hash = md5()

try:
while (size_left > 0):
#debug("SendFile: Reading up to %d bytes from '%s' - remaining bytes: %s" % (self.config.send_chunk, file.name, size_left))
if buffer == '':
data = file.read(min(self.config.send_chunk, size_left))
else:
data = buffer

md5_hash.update(data)
conn.c.send(data)
if self.config.progress_meter:
@@ -769,6 +796,7 @@ def send_file(self, request, file, labels, buffer = '', throttle = 0, retries =
if throttle:
time.sleep(throttle)
md5_computed = md5_hash.hexdigest()

response = {}
http_response = conn.c.getresponse()
response["status"] = http_response.status
18 changes: 18 additions & 0 deletions S3/Utils.py
@@ -459,4 +459,22 @@ def getHostnameFromBucket(bucket):
return Config.Config().host_bucket % { 'bucket' : bucket }
__all__.append("getHostnameFromBucket")


def calculateChecksum(buffer, mfile, offset, chunk_size, send_chunk):
md5_hash = md5()
size_left = chunk_size
if buffer == '':
mfile.seek(offset)
while size_left > 0:
data = mfile.read(min(send_chunk, size_left))
md5_hash.update(data)
size_left -= len(data)
else:
md5_hash.update(buffer)

return md5_hash.hexdigest()


__all__.append("calculateChecksum")

# vim:et:ts=4:sts=4:ai
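A quick illustration of how calculateChecksum behaves, assuming the S3 package is importable: with an empty buffer it hashes chunk_size bytes of the file starting at offset, reading send_chunk bytes at a time; with a non-empty buffer it hashes the buffer itself and ignores the file. The temporary file below is just test scaffolding:

import hashlib, tempfile
from S3.Utils import calculateChecksum   # assumes s3cmd's S3 package is on the path

data = b"0123456789" * 1000
with tempfile.NamedTemporaryFile(delete=False) as f:
    f.write(data)
    path = f.name

offset, chunk_size, send_chunk = 100, 4096, 512
expected = hashlib.md5(data[offset:offset + chunk_size]).hexdigest()

with open(path, "rb") as mfile:
    # Empty buffer: hash the slice [offset, offset + chunk_size) of the file.
    assert calculateChecksum('', mfile, offset, chunk_size, send_chunk) == expected

# Non-empty buffer: the file and offset are ignored, the buffer is hashed directly.
assert calculateChecksum(data[offset:offset + chunk_size], None, 0, chunk_size, send_chunk) == expected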
31 changes: 21 additions & 10 deletions s3cmd
@@ -325,14 +325,15 @@ def cmd_object_put(args):
except InvalidFileError, e:
warning(u"File can not be uploaded: %s" % e)
continue
speed_fmt = formatSize(response["speed"], human_readable = True, floating_point = True)
if not Config().progress_meter:
output(u"File '%s' stored as '%s' (%d bytes in %0.1f seconds, %0.2f %sB/s) %s" %
(unicodise(full_name_orig), uri_final, response["size"], response["elapsed"],
speed_fmt[0], speed_fmt[1], seq_label))
if response is not None:
speed_fmt = formatSize(response["speed"], human_readable = True, floating_point = True)
if not Config().progress_meter:
output(u"File '%s' stored as '%s' (%d bytes in %0.1f seconds, %0.2f %sB/s) %s" %
(unicodise(full_name_orig), uri_final, response["size"], response["elapsed"],
speed_fmt[0], speed_fmt[1], seq_label))
if Config().acl_public:
output(u"Public URL of the object is: %s" %
(uri_final.public_url()))
(uri_final.public_url()))
if Config().encrypt and full_name != full_name_orig:
debug(u"Removing temporary encrypted file: %s" % unicodise(full_name))
os.remove(full_name)
@@ -1268,7 +1269,7 @@ def cmd_delpolicy(args):
def cmd_multipart(args):
s3 = S3(cfg)
uri = S3Uri(args[0])

#id = ''
#if(len(args) > 1): id = args[1]

@@ -1277,11 +1278,11 @@
output(u"%s" % uri)
tree = getTreeFromXml(response['data'])
debug(parseNodes(tree))
output(u"Initiated\tId\tPath")
output(u"Initiated\tPath\tId")
for mpupload in parseNodes(tree):
try:
output("%s\t%s\t%s" % (mpupload['Initiated'], mpupload['UploadId'], mpupload['Key']))
except:
output("%s\t%s\t%s" % (mpupload['Initiated'], "s3://" + uri.bucket() + "/" + mpupload['Key'], mpupload['UploadId']))
except KeyError:
pass

def cmd_abort_multipart(args):
@@ -1808,6 +1809,8 @@ def main():
optparser.add_option( "--no-encrypt", dest="encrypt", action="store_false", help="Don't encrypt files.")
optparser.add_option("-f", "--force", dest="force", action="store_true", help="Force overwrite and other dangerous operations.")
optparser.add_option( "--continue", dest="get_continue", action="store_true", help="Continue getting a partially downloaded file (only for [get] command).")
optparser.add_option( "--continue-put", dest="put_continue", action="store_true", help="Continue uploading partially uploaded files or multipart upload parts. Restarts files/parts that don't have matching size and md5sum. Skips files/parts that do. Note: md5sum checks are not always sufficient to verify (part) file equality. Enable this at your own risk.")
optparser.add_option( "--upload-id", dest="upload_id", help="UploadId for Multipart Upload, in case you want to continue an existing upload (equivalent to --continue-put) and there are multiple partial uploads. Use s3cmd multipart [URI] to see what UploadIds are associated with the given URI.")
optparser.add_option( "--skip-existing", dest="skip_existing", action="store_true", help="Skip over files that exist at the destination (only for [get] and [sync] commands).")
optparser.add_option("-r", "--recursive", dest="recursive", action="store_true", help="Recursive upload, download or removal.")
optparser.add_option( "--check-md5", dest="check_md5", action="store_true", help="Check MD5 sums when comparing files for [sync]. (default)")
@@ -1997,6 +2000,14 @@ def main():
if cfg.multipart_chunk_size_mb > MultiPartUpload.MAX_CHUNK_SIZE_MB:
raise ParameterError("Chunk size %d MB is too large, must be <= %d MB. Please adjust --multipart-chunk-size-mb" % (cfg.multipart_chunk_size_mb, MultiPartUpload.MAX_CHUNK_SIZE_MB))

## If an UploadId was provided, set put_continue True
if options.upload_id is not None:
cfg.upload_id = options.upload_id
cfg.put_continue = True

if cfg.put_continue and not cfg.multipart_chunk_size_mb:
raise ParameterError("Must have --multipart-chunk-size-mb if using --continue-put or --upload-id")

## CloudFront's cf_enable and Config's enable share the same --enable switch
options.cf_enable = options.enable

