#!/usr/bin/python
# This file is part of Ansible
#
# Ansible is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Ansible is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Ansible. If not, see <http://www.gnu.org/licenses/>.
ANSIBLE_METADATA = {'status': ['preview'],
                    'supported_by': 'community',
                    'version': '1.0'}
DOCUMENTATION = '''
---
module: s3_sync
short_description: Efficiently upload multiple files to S3
description:
  - The S3 module is great, but it is very slow for a large volume of files - even a dozen will be noticeable. In addition to speed, it handles globbing, inclusions/exclusions, mime types, expiration mapping, recursion, and smart directory mapping.
version_added: "2.3"
options:
  mode:
    description:
      - sync direction.
    required: true
    default: 'push'
    choices: [ push ]
  file_change_strategy:
    description:
      - Difference determination method to allow changes-only syncing. Unlike rsync, files are not patched - they are fully skipped or fully uploaded.
      - date_size will upload if file sizes don't match or if local file modified date is newer than s3's version
      - checksum will compare etag values based on s3's implementation of chunked md5s.
      - force will always upload all files.
    required: false
    default: 'date_size'
    choices: [ force, checksum, date_size ]
  bucket:
    description:
      - Bucket name.
    required: true
  key_prefix:
    description:
      - In addition to file path, prepend s3 path with this prefix. Module will add slash at end of prefix if necessary.
    required: false
  file_root:
    description:
      - File/directory path for synchronization. This is a local path.
      - This root path is scrubbed from the key name, so subdirectories will remain as keys.
    required: true
  permission:
    description:
      - Canned ACL to apply to synced files.
      - Changing this ACL only changes newly synced files, it does not trigger a full reupload.
    required: false
    choices: ['', private, public-read, public-read-write, authenticated-read, aws-exec-read, bucket-owner-read, bucket-owner-full-control]
  mime_map:
    description:
      - 'Dict entry from extension to MIME type. This will override any default/sniffed MIME type. For example C({".txt": "application/text", ".yml": "application/text"})'
    required: false
  include:
    description:
      - Shell pattern-style file matching.
      - Used before exclude to determine eligible files (for instance, only "*.gif")
      - For multiple patterns, comma-separate them.
    required: false
    default: "*"
  exclude:
    description:
      - Shell pattern-style file matching.
      - Used after include to remove files (for instance, skip "*.txt")
      - For multiple patterns, comma-separate them.
    required: false
    default: ".*"
author: tedder
extends_documentation_fragment:
  - aws
  - ec2
'''
EXAMPLES = '''
- name: basic upload
  s3_sync:
    bucket: tedder
    file_root: roles/s3/files/

- name: all the options
  s3_sync:
    bucket: tedder
    file_root: roles/s3/files
    mime_map:
      .yml: application/text
      .json: application/text
    key_prefix: config_files/web
    file_change_strategy: force
    permission: public-read
    include: "*"
    exclude: "*.txt,.*"
'''
RETURN = '''
filelist_initial:
  description: file listing (dicts) from initial globbing
  returned: always
  type: list
  sample: [{
    "bytes": 151,
    "chopped_path": "policy.json",
    "fullpath": "roles/cf/files/policy.json",
    "modified_epoch": 1477416706
  }]
filelist_local_etag:
  description: file listing (dicts) including calculated local etag
  returned: always
  type: list
  sample: [{
    "bytes": 151,
    "chopped_path": "policy.json",
    "fullpath": "roles/cf/files/policy.json",
    "mime_type": "application/json",
    "modified_epoch": 1477416706,
    "s3_path": "s3sync/policy.json"
  }]
filelist_s3:
  description: file listing (dicts) including information about previously-uploaded versions
  returned: always
  type: list
  sample: [{
    "bytes": 151,
    "chopped_path": "policy.json",
    "fullpath": "roles/cf/files/policy.json",
    "mime_type": "application/json",
    "modified_epoch": 1477416706,
    "s3_path": "s3sync/policy.json"
  }]
filelist_typed:
  description: file listing (dicts) with calculated or overridden mime types
  returned: always
  type: list
  sample: [{
    "bytes": 151,
    "chopped_path": "policy.json",
    "fullpath": "roles/cf/files/policy.json",
    "mime_type": "application/json",
    "modified_epoch": 1477416706
  }]
filelist_actionable:
  description: file listing (dicts) of files that will be uploaded after the strategy decision
  returned: always
  type: list
  sample: [{
    "bytes": 151,
    "chopped_path": "policy.json",
    "fullpath": "roles/cf/files/policy.json",
    "mime_type": "application/json",
    "modified_epoch": 1477931256,
    "s3_path": "s3sync/policy.json",
    "whysize": "151 / 151",
    "whytime": "1477931256 / 1477929260"
  }]
uploads:
  description: file listing (dicts) of files that were actually uploaded
  returned: always
  type: list
  sample: [{
    "bytes": 151,
    "chopped_path": "policy.json",
    "fullpath": "roles/cf/files/policy.json",
    "s3_path": "s3sync/policy.json",
    "whysize": "151 / 151",
    "whytime": "1477931637 / 1477931489"
  }]
'''
import os
import stat as osstat  # os.stat constants
import mimetypes
import datetime
from dateutil import tz
import hashlib
import fnmatch

# import module snippets
from ansible.module_utils.basic import AnsibleModule
from ansible.module_utils.ec2 import ec2_argument_spec
# import a class, otherwise we'll use a fully qualified path
# from ansible.module_utils.ec2 import AWSRetry
import ansible.module_utils.ec2

try:
    import botocore
    HAS_BOTO3 = True
except ImportError:
    HAS_BOTO3 = False
def boto_exception(err):
    '''generic error message handler'''
    if hasattr(err, 'error_message'):
        error = err.error_message
    elif hasattr(err, 'message'):
        error = str(err.message) + ' ' + str(err) + ' - ' + str(type(err))
    else:
        error = '%s: %s' % (Exception, err)

    return error
# the following function, calculate_multipart_etag, is from tlastowka
# on github and is used under its (compatible) GPL license. So this
# license applies to the following function.
# source: https://github.com/tlastowka/calculate_multipart_etag/blob/master/calculate_multipart_etag.py
#
# calculate_multipart_etag Copyright (C) 2015
# Tony Lastowka <tlastowka at gmail dot com>
# https://github.com/tlastowka
#
#
# calculate_multipart_etag is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# calculate_multipart_etag is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with calculate_multipart_etag. If not, see <http://www.gnu.org/licenses/>.
DEFAULT_CHUNK_SIZE = 5 * 1024 * 1024
def calculate_multipart_etag(source_path, chunk_size=DEFAULT_CHUNK_SIZE):
    """
    calculates a multipart upload etag for amazon s3

    Arguments:

    source_path -- The file to calculate the etag for
    chunk_size -- The chunk size to calculate for.
    """

    md5s = []

    with open(source_path, 'rb') as fp:
        while True:
            data = fp.read(chunk_size)
            if not data:
                break
            md5s.append(hashlib.md5(data))

    if len(md5s) == 1:
        new_etag = '"{}"'.format(md5s[0].hexdigest())
    else:  # > 1
        digests = b"".join(m.digest() for m in md5s)
        new_md5 = hashlib.md5(digests)
        new_etag = '"{}-{}"'.format(new_md5.hexdigest(), len(md5s))

    return new_etag
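# Illustrative note on the etag format (not part of the upload path): with the default
# 5MB chunk size, a file smaller than one chunk yields a plain quoted md5, e.g.
# '"<32 hex chars>"', while a file spanning three chunks yields
# '"<md5 of the concatenated part digests>-3"', which is the format S3 reports for
# multipart uploads.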
def gather_files(fileroot, include=None, exclude=None):
    ret = []
    for (dirpath, dirnames, filenames) in os.walk(fileroot):
        for fn in filenames:
            fullpath = os.path.join(dirpath, fn)
            # include/exclude
            if include:
                found = False
                for x in include.split(','):
                    if fnmatch.fnmatch(fn, x):
                        found = True

                if not found:
                    # not on the include list, so we don't want it.
                    continue

            if exclude:
                found = False
                for x in exclude.split(','):
                    if fnmatch.fnmatch(fn, x):
                        found = True

                if found:
                    # skip it, even if previously included.
                    continue

            chopped_path = os.path.relpath(fullpath, start=fileroot)
            fstat = os.stat(fullpath)
            f_size = fstat[osstat.ST_SIZE]
            f_modified_epoch = fstat[osstat.ST_MTIME]
            ret.append({
                'fullpath': fullpath,
                'chopped_path': chopped_path,
                'modified_epoch': f_modified_epoch,
                'bytes': f_size,
            })
        # dirpath = path *to* the directory
        # dirnames = subdirs *in* our directory
        # filenames
    return ret
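# Illustrative call, using the paths from the RETURN samples above:
#   gather_files('roles/cf/files', include='*', exclude='.*')
# returns entries shaped like
#   {'fullpath': 'roles/cf/files/policy.json', 'chopped_path': 'policy.json',
#    'modified_epoch': 1477416706, 'bytes': 151}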
def calculate_s3_path(filelist, key_prefix=''):
    ret = []
    for fileentry in filelist:
        # don't modify the input dict
        retentry = fileentry.copy()
        retentry['s3_path'] = os.path.join(key_prefix, fileentry['chopped_path'])
        ret.append(retentry)
    return ret
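# For example, key_prefix 'config_files/web' plus chopped_path 'policy.json' becomes
# s3_path 'config_files/web/policy.json'; an empty prefix leaves chopped_path unchanged.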
def calculate_local_etag(filelist, key_prefix=''):
    '''Really, "calculate md5", but since AWS uses their own format, we'll just call
    it a "local etag". TODO optimization: only calculate if remote key exists.'''
    ret = []
    for fileentry in filelist:
        # don't modify the input dict
        retentry = fileentry.copy()
        retentry['local_etag'] = calculate_multipart_etag(fileentry['fullpath'])
        ret.append(retentry)
    return ret
def determine_mimetypes(filelist, override_map):
    ret = []
    for fileentry in filelist:
        retentry = fileentry.copy()
        localfile = fileentry['fullpath']

        # reminder: file extension is '.txt', not 'txt'.
        _, file_extension = os.path.splitext(localfile)
        if override_map and override_map.get(file_extension):
            # override? use it.
            retentry['mime_type'] = override_map[file_extension]
        else:
            # else sniff it
            retentry['mime_type'], retentry['encoding'] = mimetypes.guess_type(localfile, strict=False)

        # might be None or '' from one of the above. Not a great type but better than nothing.
        if not retentry['mime_type']:
            retentry['mime_type'] = 'application/octet-stream'

        ret.append(retentry)

    return ret
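# Illustrative behavior: with override_map {'.yml': 'application/text'} (as in EXAMPLES),
# 'site.yml' is mapped to 'application/text'; without an override, 'policy.json' is
# typically sniffed as 'application/json', and anything unrecognized falls back to
# 'application/octet-stream'.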
def head_s3(s3, bucket, s3keys):
    retkeys = []
    for entry in s3keys:
        retentry = entry.copy()
        # don't modify the input dict
        try:
            retentry['s3_head'] = s3.head_object(Bucket=bucket, Key=entry['s3_path'])
        except botocore.exceptions.ClientError as err:
            if (hasattr(err, 'response') and
                    'ResponseMetadata' in err.response and
                    'HTTPStatusCode' in err.response['ResponseMetadata'] and
                    str(err.response['ResponseMetadata']['HTTPStatusCode']) == '404'):
                pass
            else:
                raise Exception(err)
                # error_msg = boto_exception(err)
                # return {'error': error_msg}
        retkeys.append(retentry)
    return retkeys
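# Note: s3_head is boto3's head_object response; the keys consumed later (ETag,
# LastModified, ContentLength) drive the checksum and date_size strategies, while a
# 404 simply leaves s3_head unset so the file is treated as new and gets uploaded.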
def filter_list(s3, bucket, s3filelist, strategy):
    keeplist = list(s3filelist)

    for e in keeplist:
        e['_strategy'] = strategy

    # init/fetch info from S3 if we're going to use it for comparisons
    if strategy != 'force':
        keeplist = head_s3(s3, bucket, s3filelist)

    # now actually run the strategies
    if strategy == 'checksum':
        for entry in keeplist:
            if entry.get('s3_head'):
                # since we have a remote s3 object, compare the values.
                if entry['s3_head']['ETag'] == entry['local_etag']:
                    # files match, so remove the entry
                    entry['skip_flag'] = True
                else:
                    # file etags don't match, keep the entry.
                    pass
            else:  # we don't have an etag, so we'll keep it.
                pass
    elif strategy == 'date_size':
        for entry in keeplist:
            if entry.get('s3_head'):
                # fstat = entry['stat']
                local_modified_epoch = entry['modified_epoch']
                local_size = entry['bytes']

                # py2's datetime doesn't have a timestamp() field, so we have to revert to something more awkward.
                # remote_modified_epoch = entry['s3_head']['LastModified'].timestamp()
                remote_modified_datetime = entry['s3_head']['LastModified']
                delta = (remote_modified_datetime - datetime.datetime(1970, 1, 1, tzinfo=tz.tzutc()))
                remote_modified_epoch = delta.seconds + (delta.days * 86400)

                remote_size = entry['s3_head']['ContentLength']

                entry['whytime'] = '{} / {}'.format(local_modified_epoch, remote_modified_epoch)
                entry['whysize'] = '{} / {}'.format(local_size, remote_size)

                # per the documented date_size strategy, skip only when the size matches
                # AND the local copy is not newer than the remote one.
                if local_modified_epoch <= remote_modified_epoch and local_size == remote_size:
                    entry['skip_flag'] = True
            else:
                entry['why'] = "no s3_head"
    # else: probably 'force'. Basically we don't skip with any other strategies.
    else:
        pass

    # prune 'please skip' entries, if any.
    return [x for x in keeplist if not x.get('skip_flag')]
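# For reference, the whytime/whysize diagnostics read as 'local / remote'. In the
# filelist_actionable sample above, whysize '151 / 151' and whytime
# '1477931256 / 1477929260' mean the sizes matched but the local copy was newer, so
# the entry stays in the actionable list.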
def upload_files(s3, bucket, filelist, params):
    ret = []
    for entry in filelist:
        args = {
            'ContentType': entry['mime_type']
        }
        if params.get('permission'):
            args['ACL'] = params['permission']
        s3.upload_file(entry['fullpath'], bucket, entry['s3_path'], ExtraArgs=args, Callback=None, Config=None)
        ret.append(entry)
    return ret
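# For reference, the ExtraArgs dict passed to boto3's upload_file ends up looking like
# {'ContentType': 'application/json', 'ACL': 'public-read'} when a canned ACL is set,
# and just {'ContentType': ...} otherwise.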
def main():
    argument_spec = ec2_argument_spec()
    argument_spec.update(dict(
        mode=dict(choices=['push'], default='push'),
        file_change_strategy=dict(choices=['force', 'date_size', 'checksum'], default='date_size'),
        bucket=dict(required=True),
        key_prefix=dict(required=False, default=''),
        file_root=dict(required=True, type='path'),
        permission=dict(required=False, choices=['private', 'public-read', 'public-read-write', 'authenticated-read',
                                                 'aws-exec-read', 'bucket-owner-read', 'bucket-owner-full-control']),
        retries=dict(required=False),
        mime_map=dict(required=False, type='dict'),
        exclude=dict(required=False, default=".*"),
        include=dict(required=False, default="*"),
        # future options: cache_control (string or map, perhaps), encoding, metadata, storage_class, retries
    ))

    module = AnsibleModule(
        argument_spec=argument_spec,
    )
    if not HAS_BOTO3:
        module.fail_json(msg='boto3 required for this module')

    result = {}
    mode = module.params['mode']

    try:
        region, ec2_url, aws_connect_kwargs = ansible.module_utils.ec2.get_aws_connection_info(module, boto3=True)
        s3 = ansible.module_utils.ec2.boto3_conn(module, conn_type='client', resource='s3',
                                                 region=region, endpoint=ec2_url, **aws_connect_kwargs)
        s3.list_buckets()
    except botocore.exceptions.NoCredentialsError as e:
        module.fail_json(msg=str(e))

    if mode == 'push':
        try:
            result['filelist_initial'] = gather_files(module.params['file_root'], exclude=module.params['exclude'], include=module.params['include'])
            result['filelist_typed'] = determine_mimetypes(result['filelist_initial'], module.params.get('mime_map'))
            result['filelist_s3'] = calculate_s3_path(result['filelist_typed'], module.params['key_prefix'])
            result['filelist_local_etag'] = calculate_local_etag(result['filelist_s3'])
            result['filelist_actionable'] = filter_list(s3, module.params['bucket'], result['filelist_local_etag'], module.params['file_change_strategy'])
            result['uploads'] = upload_files(s3, module.params['bucket'], result['filelist_actionable'], module.params)

            # mark changed if we actually upload something.
            if result.get('uploads'):
                result['changed'] = True
            # result.update(filelist=actionable_filelist)
        except Exception as err:
            error_msg = boto_exception(err)
            import traceback  # traces get swallowed by Ansible.
            module.fail_json(msg=error_msg, traceback=traceback.format_exc().splitlines())

    module.exit_json(**result)
if __name__ == '__main__':
    main()