在 Python 中使用直接请求从 S3 下载
Downloading from S3 using a direct request in Python
给定
# Inputs assumed by the script below (placeholder values).
bucket = 'mybucket'
# NOTE(review): the script below reads `aws_access_key`, which is never
# defined; only `aws_id` is defined here.
aws_id = '.....'
aws_secret_key = '........'
file_key = '/some/file/key'
# NOTE(review): `range` shadows the Python builtin of the same name.
range = '40-2000'
我想通过 Python 发送请求,以获取文件的相应部分。
我改编了 AWS docs 中的 EC2(第一个)示例 -
import sys, os, base64, datetime, hashlib, hmac
import requests # I would prefer urllib or anything that comes with Python
# Host placed in the signed `host` header, region used for the signing key
# and credential scope, and the URL the request is sent to.
host = 's3.amazonaws.com'
region = 'us-east-1'
endpoint = 'https://s3.amazonaws.com'
# Amazon's key signing logic
def sign(key, msg):
    """Return the raw HMAC-SHA256 digest of ``msg`` keyed with ``key``.

    ``key`` is bytes; ``msg`` is text and is UTF-8 encoded before hashing.
    """
    return hmac.new(key, msg.encode('utf-8'), hashlib.sha256).digest()
def getSignatureKey(key, dateStamp, regionName, serviceName):
    """Derive the signing key by chaining four HMAC-SHA256 steps.

    The secret ``key`` is prefixed with 'AWS4' and then successively keyed
    with the date stamp, the region name, the service name and the literal
    'aws4_request'. Returns the final raw digest (bytes).
    """
    kDate = sign(('AWS4' + key).encode('utf-8'), dateStamp)
    kRegion = sign(kDate, regionName)
    kService = sign(kRegion, serviceName)
    kSigning = sign(kService, 'aws4_request')
    return kSigning
# Create a date for headers and the credential string
t = datetime.datetime.utcnow()
amzdate = t.strftime('%Y%m%dT%H%M%SZ')
datestamp = t.strftime('%Y%m%d') # Date w/o time, used in credential scope
# Create a signing key from the aws_secret_key
signing_key = getSignatureKey(aws_secret_key, datestamp, region, 's3')
# Generate request parts - What canonical_querystring is required for downloading (part of) a file?
# NOTE(review): '???' is an unresolved placeholder; it is both signed and
# appended to the request URL exactly as written.
canonical_querystring = '???' # This string is appended to the endpoint url when the request is sent
canonical_headers = 'host:' + host + '\n' + 'x-amz-date:' + amzdate + '\n'
signed_headers = 'host;x-amz-date'
# SHA-256 hex digest of an empty string, used as the payload hash
payload_hash = hashlib.sha256(('').encode('utf-8')).hexdigest()
# Combine elements to create canonical request, that will be used to create an authorization header
# NOTE(review): the canonical URI is hard-coded to '/', so neither `bucket`
# nor `file_key` is part of what gets signed or requested.
canonical_request = 'GET' + '\n' + '/' + '\n' + canonical_querystring + '\n' + canonical_headers + '\n' + signed_headers + '\n' + payload_hash
# Create hashed version of the canonical request with yet more prefixes
algorithm = 'AWS4-HMAC-SHA256'
credential_scope = datestamp + '/' + region + '/' + 's3' + '/' + 'aws4_request'
string_to_sign = algorithm + '\n' + amzdate + '\n' + credential_scope + '\n' + hashlib.sha256(canonical_request.encode('utf-8')).hexdigest()
# Sign the semi-hashed string from above using the signing key (the one we generated with our secret key)
signature = hmac.new(signing_key, (string_to_sign).encode('utf-8'), hashlib.sha256).hexdigest()
# Generate authorization header from the signature, access key and yet more prefixes
# NOTE(review): `aws_access_key` is never defined in this snippet (the inputs
# define `aws_id`), so this line raises NameError as written.
authorization_header = algorithm + ' ' + 'Credential=' + aws_access_key + '/' + credential_scope + ', ' + 'SignedHeaders=' + signed_headers + ', ' + 'Signature=' + signature
# Generate headers dictionary. Do I need more/other headers here for downloading a file?
# NOTE(review): no Range header is sent, so the `range` input is unused.
# NOTE(review): the S3 SigV4 docs appear to require an x-amz-content-sha256
# header as well - confirm against the AWS reference.
headers = {'x-amz-date':amzdate, 'Authorization':authorization_header}
# The gods are shining upon us
r = requests.get(endpoint + '?' + canonical_querystring, headers=headers)
print('Response code: %d\n' % r.status_code)
# Wish to get my bytestring here
我仔细阅读了 boto3 的源代码,试图理解这种直接请求的机制,但始终没能弄明白如何用 requests/urllib 让这段代码片段正常工作。
谁能指出完成改编所缺少的内容?
如果您只是想下载文件的内容以便在 python 中使用,这里是我的代码的简短版本。
import boto3

# Uses the credentials stored under the 'maintenance' profile, so that
# profile must exist in the local AWS configuration.
aws = boto3.session.Session(profile_name='maintenance')
s3 = aws.client('s3', region_name='us-west-2')
# get_object returns a dict; its 'Body' entry is a stream whose read()
# yields the object's bytes. To fetch only part of the object, get_object
# also accepts a Range argument such as Range='bytes=40-2000'.
data = s3.get_object(
    Bucket='my_bucket_name',
    Key='/path/of/s3/key'
)['Body'].read()
现在您拥有了整个文件,您可以像处理其他代码一样处理它。
编辑:听起来您还没有凭据或任何设置。 boto3(和大多数亚马逊 CLI 产品)需要以下格式的凭据文件:
文件名:~/.aws/credentials
[default]
# (random keys typed by me)
aws_secret_access_key = 9087OKLJHAFWSKLDJGHNAKLJHWR34K
aws_access_key_id = MORERANDOMKEYSTOFILLTHESPACE
创建该文件后,我想你就配置好了。
请求是一个 HTTP GET。我用 requests-toolbelt 的 dump 工具查看了它的样子,需要发送的内容如下:
# Template lines of the raw HTTP GET request; each '{}' is filled in later
# and the lines are meant to be joined with CRLF.
s3_message_parts = [
    "GET {} HTTP/1.1",                         # request line: object path
    "Host: {}",
    "Connection: keep-alive",
    "Accept-Encoding: gzip, deflate",
    "Accept: */*",
    "User-Agent: ssup",
    "X-Amz-Content-Sha256: UNSIGNED-PAYLOAD",
    "Range: bytes={}-{}",                      # first and last byte offsets
    "X-Amz-Date: {}",
    "Authorization: {}",
    "\r\n",                                    # yields the blank line ending the headers
]
2 个棘手的部分:
给定一个存储桶和一个对象键(key),找出要与之交互的主机/端点。
正确填写 Authorization header。
我没有解决 1,只是提供了我提前为我的存储桶找到的端点。
就 2 而言,我通过查看出色的 minio-py 库设法弄清楚了签名过程。
整个过程大致如下(最新版本):
import hashlib, hmac, socket, ssl
from datetime import datetime
try:
    from urlparse import urlsplit  # Python 2
except ImportError:
    from urllib.parse import urlsplit  # Python 3
# Algorithm label placed in the string to sign and the Authorization header.
# The name is misspelled (sic) but is kept because other code refers to it.
ALGORTHM = 'AWS4-HMAC-SHA256'


def sign(key, msg):
    """Return the raw HMAC-SHA256 digest of text ``msg`` under bytes ``key``."""
    encoded = msg.encode('utf-8')
    mac = hmac.new(key, encoded, hashlib.sha256)
    return mac.digest()
def sign_headers(headers, url, access_key, secret_key, region = 'us-east-1'):
    """Add AWS Signature Version 4 headers for an unsigned-payload S3 GET.

    ``headers`` is updated in place (and also returned) with ``Host``,
    ``X-Amz-Date``, ``X-Amz-Content-Sha256`` and ``Authorization``. Every
    header present in the dict is included in the signature.

    Args:
        headers: dict of request headers to sign, e.g. ``{'Range': ...}``.
        url: URL of the request; host, path and query are taken from it.
            The path is signed verbatim (no URI-encoding is applied here).
        access_key: access key id placed in the credential string.
        secret_key: secret key the signing key is derived from.
        region: region used in the credential scope and signing key.

    Returns:
        The same ``headers`` dict, now including ``Authorization``.
    """
    method = 'GET'
    parsed_url = urlsplit(url)
    # A bare endpoint URL has an empty path; it must be signed as '/'.
    canonical_uri = parsed_url.path or '/'

    # One timestamp feeds both the full date-time and the date-only stamp.
    now = datetime.utcnow()
    aws_datetime = now.strftime("%Y%m%dT%H%M%SZ")
    aws_date = now.strftime("%Y%m%d")

    # Scope, scoped credential, and the chained-HMAC signing key.
    scope = '/'.join([aws_date, region, 's3', 'aws4_request'])
    credential = '/'.join([access_key, scope])
    signing_key = ('AWS4' + secret_key).encode('utf-8')
    for part in (aws_date, region, 's3', 'aws4_request'):
        signing_key = sign(signing_key, part)

    # Fill in every header except 'Authorization'.
    headers['Host'] = parsed_url.netloc
    headers['X-Amz-Date'] = aws_datetime
    headers['X-Amz-Content-Sha256'] = u'UNSIGNED-PAYLOAD'

    # Header names are lowercased and ordered by that lowercased name, so the
    # result does not depend on how the caller capitalised the keys.
    canonical = sorted((name.lower().strip(), str(value).strip())
                       for name, value in headers.items())
    sorted_headers_string = ';'.join(name for name, _ in canonical)
    canonical_header_list = [name + ':' + value for name, value in canonical]

    # The '' element produces the blank line that follows the header list.
    canonical_req = '\n'.join(
        [method, canonical_uri, parsed_url.query]
        + canonical_header_list
        + ['', sorted_headers_string, u'UNSIGNED-PAYLOAD'])
    string_to_sign = '\n'.join([
        ALGORTHM,
        aws_datetime,
        scope,
        hashlib.sha256(canonical_req.encode('utf-8')).hexdigest(),
    ])
    signature = hmac.new(signing_key, string_to_sign.encode('utf-8'), hashlib.sha256).hexdigest()

    headers['Authorization'] = ALGORTHM + ' Credential=' + credential + ', ' + 'SignedHeaders=' + sorted_headers_string + ', ' + 'Signature=' + signature
    return headers
def download_s3_chunk(bucket, key, start, end, access_key, secret_key, endpoint = 'https://s3.amazonaws.com', region = 'us-east-1'):
    """Download bytes ``start``..``end`` of an S3 object using only the stdlib.

    Sends a signed, path-style ``GET /<bucket>/<key>`` with a ``Range``
    header over a TLS socket to the host named in ``endpoint`` and reads the
    response until the server closes the connection.

    Args:
        bucket: bucket name.
        key: object key; used verbatim in the request path (assumed URL-safe).
        start, end: first and last byte offsets for the ``Range`` header.
        access_key, secret_key: credentials passed to ``sign_headers``.
        endpoint: base URL of the S3 endpoint; the port defaults to 443.
        region: region passed to ``sign_headers``.

    Returns:
        ``(header, chunk)``: the raw response head (status line + headers)
        and the response body, both as bytes.
    """
    path = '/' + bucket + '/' + key
    headers = {'Range': 'bytes={}-{}'.format(start, end), 'User-Agent': 'ssup'}
    # Sign the URL that is actually requested (endpoint + object path) and
    # honour the caller's region instead of the signer's default.
    headers = sign_headers(headers, endpoint.rstrip('/') + path, access_key, secret_key, region)

    # Raw message to send via socket. 'Connection: close' (not signed) makes
    # the server end the stream so the read loop below terminates.
    s3_message_parts = ['GET {} HTTP/1.1',
                        'Host: {}',
                        'Connection: close',
                        'Accept-Encoding: gzip, deflate',
                        'Accept: */*',
                        'User-Agent: ssup',
                        'X-Amz-Content-Sha256: UNSIGNED-PAYLOAD',
                        'Range: bytes={}-{}',
                        'X-Amz-Date: {}',
                        'Authorization: {}',
                        '\r\n']
    message_params = (path, headers['Host'], start, end, headers['X-Amz-Date'], headers['Authorization'])
    # The tuple must be unpacked: the template has six placeholders.
    s3_download_message = '\r\n'.join(s3_message_parts).format(*message_params)

    parsed_endpoint = urlsplit(endpoint)
    hostname = parsed_endpoint.hostname
    port = parsed_endpoint.port or 443

    received = []
    s = socket.create_connection((hostname, port))
    try:
        # Rebinding `s` means the finally clause closes whichever socket
        # (plain or TLS-wrapped) is current if anything fails.
        s = ssl.create_default_context().wrap_socket(s, server_hostname=hostname)
        s.sendall(s3_download_message.encode('utf-8'))
        while True:
            data = s.recv(65536)
            if not data:
                break
            received.append(data)
    finally:
        s.close()

    # The head and the body are separated by the first blank line.
    header, _, chunk = b''.join(received).partition(b'\r\n\r\n')
    return header, chunk
if __name__=='__main__':
    # Example driver with placeholder values.
    # Adjust to get arguments from command prompt
    from sys import argv as args
    # Credentials
    access_key = 'access'
    secret_key = 'secret'
    # Bucket, key and location info
    bucket = 'my_bucket'
    key = 'my_key'
    # Chunk of key to download
    start = 20
    end = 100
    header, chunk = download_s3_chunk(bucket, key, start, end, access_key, secret_key)
只要存在 sha256 和 hmac 实现,逻辑就非常可移植。希望这会派上用场。
给定
# Inputs assumed by the script below (placeholder values).
bucket = 'mybucket'
# NOTE(review): the script below reads `aws_access_key`, which is never
# defined; only `aws_id` is defined here.
aws_id = '.....'
aws_secret_key = '........'
file_key = '/some/file/key'
# NOTE(review): `range` shadows the Python builtin of the same name.
range = '40-2000'
我想通过 Python 发送请求,以获取文件的相应部分。
我改编了 AWS docs 中的 EC2(第一个)示例 -
import sys, os, base64, datetime, hashlib, hmac
import requests # I would prefer urllib or anything that comes with Python
# Host placed in the signed `host` header, region used for the signing key
# and credential scope, and the URL the request is sent to.
host = 's3.amazonaws.com'
region = 'us-east-1'
endpoint = 'https://s3.amazonaws.com'
# Amazon's key signing logic
def sign(key, msg):
    """Return the raw HMAC-SHA256 digest of ``msg`` keyed with ``key``.

    ``key`` is bytes; ``msg`` is text and is UTF-8 encoded before hashing.
    """
    return hmac.new(key, msg.encode('utf-8'), hashlib.sha256).digest()
def getSignatureKey(key, dateStamp, regionName, serviceName):
    """Derive the signing key by chaining four HMAC-SHA256 steps.

    The secret ``key`` is prefixed with 'AWS4' and then successively keyed
    with the date stamp, the region name, the service name and the literal
    'aws4_request'. Returns the final raw digest (bytes).
    """
    kDate = sign(('AWS4' + key).encode('utf-8'), dateStamp)
    kRegion = sign(kDate, regionName)
    kService = sign(kRegion, serviceName)
    kSigning = sign(kService, 'aws4_request')
    return kSigning
# Create a date for headers and the credential string
t = datetime.datetime.utcnow()
amzdate = t.strftime('%Y%m%dT%H%M%SZ')
datestamp = t.strftime('%Y%m%d') # Date w/o time, used in credential scope
# Create a signing key from the aws_secret_key
signing_key = getSignatureKey(aws_secret_key, datestamp, region, 's3')
# Generate request parts - What canonical_querystring is required for downloading (part of) a file?
# NOTE(review): '???' is an unresolved placeholder; it is both signed and
# appended to the request URL exactly as written.
canonical_querystring = '???' # This string is appended to the endpoint url when the request is sent
canonical_headers = 'host:' + host + '\n' + 'x-amz-date:' + amzdate + '\n'
signed_headers = 'host;x-amz-date'
# SHA-256 hex digest of an empty string, used as the payload hash
payload_hash = hashlib.sha256(('').encode('utf-8')).hexdigest()
# Combine elements to create canonical request, that will be used to create an authorization header
# NOTE(review): the canonical URI is hard-coded to '/', so neither `bucket`
# nor `file_key` is part of what gets signed or requested.
canonical_request = 'GET' + '\n' + '/' + '\n' + canonical_querystring + '\n' + canonical_headers + '\n' + signed_headers + '\n' + payload_hash
# Create hashed version of the canonical request with yet more prefixes
algorithm = 'AWS4-HMAC-SHA256'
credential_scope = datestamp + '/' + region + '/' + 's3' + '/' + 'aws4_request'
string_to_sign = algorithm + '\n' + amzdate + '\n' + credential_scope + '\n' + hashlib.sha256(canonical_request.encode('utf-8')).hexdigest()
# Sign the semi-hashed string from above using the signing key (the one we generated with our secret key)
signature = hmac.new(signing_key, (string_to_sign).encode('utf-8'), hashlib.sha256).hexdigest()
# Generate authorization header from the signature, access key and yet more prefixes
# NOTE(review): `aws_access_key` is never defined in this snippet (the inputs
# define `aws_id`), so this line raises NameError as written.
authorization_header = algorithm + ' ' + 'Credential=' + aws_access_key + '/' + credential_scope + ', ' + 'SignedHeaders=' + signed_headers + ', ' + 'Signature=' + signature
# Generate headers dictionary. Do I need more/other headers here for downloading a file?
# NOTE(review): no Range header is sent, so the `range` input is unused.
# NOTE(review): the S3 SigV4 docs appear to require an x-amz-content-sha256
# header as well - confirm against the AWS reference.
headers = {'x-amz-date':amzdate, 'Authorization':authorization_header}
# The gods are shining upon us
r = requests.get(endpoint + '?' + canonical_querystring, headers=headers)
print('Response code: %d\n' % r.status_code)
# Wish to get my bytestring here
我仔细阅读了 boto3 的源代码,试图理解这种直接请求的机制,但始终没能弄明白如何用 requests/urllib 让这段代码片段正常工作。
谁能指出完成改编所缺少的内容?
如果您只是想下载文件的内容以便在 python 中使用,这里是我的代码的简短版本。
import boto3

# Uses the credentials stored under the 'maintenance' profile, so that
# profile must exist in the local AWS configuration.
aws = boto3.session.Session(profile_name='maintenance')
s3 = aws.client('s3', region_name='us-west-2')
# get_object returns a dict; its 'Body' entry is a stream whose read()
# yields the object's bytes. To fetch only part of the object, get_object
# also accepts a Range argument such as Range='bytes=40-2000'.
data = s3.get_object(
    Bucket='my_bucket_name',
    Key='/path/of/s3/key'
)['Body'].read()
现在您拥有了整个文件,您可以像处理其他代码一样处理它。
编辑:听起来您还没有凭据或任何设置。 boto3(和大多数亚马逊 CLI 产品)需要以下格式的凭据文件:
文件名:~/.aws/credentials
[default]
# (random keys typed by me)
aws_secret_access_key = 9087OKLJHAFWSKLDJGHNAKLJHWR34K
aws_access_key_id = MORERANDOMKEYSTOFILLTHESPACE
创建该文件后,我想你就配置好了。
请求是一个 HTTP GET。我用 requests-toolbelt 的 dump 工具查看了它的样子,需要发送的内容如下:
# Template lines of the raw HTTP GET request; each '{}' is filled in later
# and the lines are meant to be joined with CRLF.
s3_message_parts = [
    "GET {} HTTP/1.1",                         # request line: object path
    "Host: {}",
    "Connection: keep-alive",
    "Accept-Encoding: gzip, deflate",
    "Accept: */*",
    "User-Agent: ssup",
    "X-Amz-Content-Sha256: UNSIGNED-PAYLOAD",
    "Range: bytes={}-{}",                      # first and last byte offsets
    "X-Amz-Date: {}",
    "Authorization: {}",
    "\r\n",                                    # yields the blank line ending the headers
]
2 个棘手的部分:
给定一个存储桶和一个对象键(key),找出要与之交互的主机/端点。
正确填写 Authorization header。
我没有解决 1,只是提供了我提前为我的存储桶找到的端点。
就 2 而言,我通过查看出色的 minio-py 库设法弄清楚了签名过程。
整个过程大致如下(最新版本):
import hashlib, hmac, socket, ssl
from datetime import datetime
try:
    from urlparse import urlsplit  # Python 2
except ImportError:
    from urllib.parse import urlsplit  # Python 3
# Algorithm label placed in the string to sign and the Authorization header.
# The name is misspelled (sic) but is kept because other code refers to it.
ALGORTHM = 'AWS4-HMAC-SHA256'


def sign(key, msg):
    """Return the raw HMAC-SHA256 digest of text ``msg`` under bytes ``key``."""
    encoded = msg.encode('utf-8')
    mac = hmac.new(key, encoded, hashlib.sha256)
    return mac.digest()
def sign_headers(headers, url, access_key, secret_key, region = 'us-east-1'):
    """Add AWS Signature Version 4 headers for an unsigned-payload S3 GET.

    ``headers`` is updated in place (and also returned) with ``Host``,
    ``X-Amz-Date``, ``X-Amz-Content-Sha256`` and ``Authorization``. Every
    header present in the dict is included in the signature.

    Args:
        headers: dict of request headers to sign, e.g. ``{'Range': ...}``.
        url: URL of the request; host, path and query are taken from it.
            The path is signed verbatim (no URI-encoding is applied here).
        access_key: access key id placed in the credential string.
        secret_key: secret key the signing key is derived from.
        region: region used in the credential scope and signing key.

    Returns:
        The same ``headers`` dict, now including ``Authorization``.
    """
    method = 'GET'
    parsed_url = urlsplit(url)
    # A bare endpoint URL has an empty path; it must be signed as '/'.
    canonical_uri = parsed_url.path or '/'

    # One timestamp feeds both the full date-time and the date-only stamp.
    now = datetime.utcnow()
    aws_datetime = now.strftime("%Y%m%dT%H%M%SZ")
    aws_date = now.strftime("%Y%m%d")

    # Scope, scoped credential, and the chained-HMAC signing key.
    scope = '/'.join([aws_date, region, 's3', 'aws4_request'])
    credential = '/'.join([access_key, scope])
    signing_key = ('AWS4' + secret_key).encode('utf-8')
    for part in (aws_date, region, 's3', 'aws4_request'):
        signing_key = sign(signing_key, part)

    # Fill in every header except 'Authorization'.
    headers['Host'] = parsed_url.netloc
    headers['X-Amz-Date'] = aws_datetime
    headers['X-Amz-Content-Sha256'] = u'UNSIGNED-PAYLOAD'

    # Header names are lowercased and ordered by that lowercased name, so the
    # result does not depend on how the caller capitalised the keys.
    canonical = sorted((name.lower().strip(), str(value).strip())
                       for name, value in headers.items())
    sorted_headers_string = ';'.join(name for name, _ in canonical)
    canonical_header_list = [name + ':' + value for name, value in canonical]

    # The '' element produces the blank line that follows the header list.
    canonical_req = '\n'.join(
        [method, canonical_uri, parsed_url.query]
        + canonical_header_list
        + ['', sorted_headers_string, u'UNSIGNED-PAYLOAD'])
    string_to_sign = '\n'.join([
        ALGORTHM,
        aws_datetime,
        scope,
        hashlib.sha256(canonical_req.encode('utf-8')).hexdigest(),
    ])
    signature = hmac.new(signing_key, string_to_sign.encode('utf-8'), hashlib.sha256).hexdigest()

    headers['Authorization'] = ALGORTHM + ' Credential=' + credential + ', ' + 'SignedHeaders=' + sorted_headers_string + ', ' + 'Signature=' + signature
    return headers
def download_s3_chunk(bucket, key, start, end, access_key, secret_key, endpoint = 'https://s3.amazonaws.com', region = 'us-east-1'):
    """Download bytes ``start``..``end`` of an S3 object using only the stdlib.

    Sends a signed, path-style ``GET /<bucket>/<key>`` with a ``Range``
    header over a TLS socket to the host named in ``endpoint`` and reads the
    response until the server closes the connection.

    Args:
        bucket: bucket name.
        key: object key; used verbatim in the request path (assumed URL-safe).
        start, end: first and last byte offsets for the ``Range`` header.
        access_key, secret_key: credentials passed to ``sign_headers``.
        endpoint: base URL of the S3 endpoint; the port defaults to 443.
        region: region passed to ``sign_headers``.

    Returns:
        ``(header, chunk)``: the raw response head (status line + headers)
        and the response body, both as bytes.
    """
    path = '/' + bucket + '/' + key
    headers = {'Range': 'bytes={}-{}'.format(start, end), 'User-Agent': 'ssup'}
    # Sign the URL that is actually requested (endpoint + object path) and
    # honour the caller's region instead of the signer's default.
    headers = sign_headers(headers, endpoint.rstrip('/') + path, access_key, secret_key, region)

    # Raw message to send via socket. 'Connection: close' (not signed) makes
    # the server end the stream so the read loop below terminates.
    s3_message_parts = ['GET {} HTTP/1.1',
                        'Host: {}',
                        'Connection: close',
                        'Accept-Encoding: gzip, deflate',
                        'Accept: */*',
                        'User-Agent: ssup',
                        'X-Amz-Content-Sha256: UNSIGNED-PAYLOAD',
                        'Range: bytes={}-{}',
                        'X-Amz-Date: {}',
                        'Authorization: {}',
                        '\r\n']
    message_params = (path, headers['Host'], start, end, headers['X-Amz-Date'], headers['Authorization'])
    # The tuple must be unpacked: the template has six placeholders.
    s3_download_message = '\r\n'.join(s3_message_parts).format(*message_params)

    parsed_endpoint = urlsplit(endpoint)
    hostname = parsed_endpoint.hostname
    port = parsed_endpoint.port or 443

    received = []
    s = socket.create_connection((hostname, port))
    try:
        # Rebinding `s` means the finally clause closes whichever socket
        # (plain or TLS-wrapped) is current if anything fails.
        s = ssl.create_default_context().wrap_socket(s, server_hostname=hostname)
        s.sendall(s3_download_message.encode('utf-8'))
        while True:
            data = s.recv(65536)
            if not data:
                break
            received.append(data)
    finally:
        s.close()

    # The head and the body are separated by the first blank line.
    header, _, chunk = b''.join(received).partition(b'\r\n\r\n')
    return header, chunk
if __name__=='__main__':
    # Example driver with placeholder values.
    # Adjust to get arguments from command prompt
    from sys import argv as args
    # Credentials
    access_key = 'access'
    secret_key = 'secret'
    # Bucket, key and location info
    bucket = 'my_bucket'
    key = 'my_key'
    # Chunk of key to download
    start = 20
    end = 100
    header, chunk = download_s3_chunk(bucket, key, start, end, access_key, secret_key)
只要存在 sha256 和 hmac 实现,逻辑就非常可移植。希望这会派上用场。