Skip to content

Commit bcf1266

Browse files
authored
Merge pull request #700 from Altinity/fix_s3_path_style_no_key_parsing
Fix incorrect S3 uri parsing when key is not specified on path style
2 parents f35cf13 + a660dab commit bcf1266

3 files changed

Lines changed: 77 additions & 3 deletions

File tree

src/IO/S3/URI.cpp

Lines changed: 35 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ URI::URI(const std::string & uri_, bool allow_archive_path_syntax)
4747
/// Case when bucket name and key represented in the path of S3 URL.
4848
/// E.g. (https://s3.region.amazonaws.com/bucket-name/key)
4949
/// https://docs.aws.amazon.com/AmazonS3/latest/dev/VirtualHosting.html#path-style-access
50-
static const RE2 path_style_pattern("^/([^/]*)/(.*)");
50+
static const RE2 path_style_pattern("^/([^/]*)(?:/?(.*))");
5151

5252
if (allow_archive_path_syntax)
5353
std::tie(uri_str, archive_pattern) = getURIAndArchivePattern(uri_);
@@ -124,7 +124,6 @@ URI::URI(const std::string & uri_, bool allow_archive_path_syntax)
124124
{
125125
endpoint = uri.getScheme() + "://" + name + endpoint_authority_from_uri;
126126
}
127-
validateBucket(bucket, uri);
128127

129128
if (!uri.getPath().empty())
130129
{
@@ -142,7 +141,6 @@ URI::URI(const std::string & uri_, bool allow_archive_path_syntax)
142141
{
143142
is_virtual_hosted_style = false;
144143
endpoint = uri.getScheme() + "://" + uri.getAuthority();
145-
validateBucket(bucket, uri);
146144
}
147145
else
148146
{
@@ -155,6 +153,9 @@ URI::URI(const std::string & uri_, bool allow_archive_path_syntax)
155153
if (!uri.getPath().empty())
156154
key = uri.getPath().substr(1);
157155
}
156+
157+
validateBucket(bucket, uri);
158+
validateKey(key, uri);
158159
}
159160

160161
void URI::addRegionToURI(const std::string &region)
@@ -175,6 +176,37 @@ void URI::validateBucket(const String & bucket, const Poco::URI & uri)
175176
!uri.empty() ? " (" + uri.toString() + ")" : "");
176177
}
177178

179+
void URI::validateKey(const String & key, const Poco::URI & uri)
180+
{
181+
auto onError = [&]()
182+
{
183+
throw Exception(
184+
ErrorCodes::BAD_ARGUMENTS,
185+
"Invalid S3 key: {}{}",
186+
quoteString(key),
187+
!uri.empty() ? " (" + uri.toString() + ")" : "");
188+
};
189+
190+
191+
// this shouldn't happen ever because the regex should not catch this
192+
if (key.size() == 1 && key[0] == '/')
193+
{
194+
onError();
195+
}
196+
197+
// the current regex impl allows something like "bucket-name/////".
198+
// bucket: bucket-name
199+
// key: ////
200+
// throw exception in case such thing is found
201+
for (size_t i = 1; i < key.size(); i++)
202+
{
203+
if (key[i - 1] == '/' && key[i] == '/')
204+
{
205+
onError();
206+
}
207+
}
208+
}
209+
178210
std::pair<std::string, std::optional<std::string>> URI::getURIAndArchivePattern(const std::string & source)
179211
{
180212
size_t pos = source.find("::");

src/IO/S3/URI.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ struct URI
4040
void addRegionToURI(const std::string & region);
4141

4242
static void validateBucket(const std::string & bucket, const Poco::URI & uri);
43+
static void validateKey(const std::string & key, const Poco::URI & uri);
4344

4445
private:
4546
std::pair<std::string, std::optional<std::string>> getURIAndArchivePattern(const std::string & source);

src/IO/S3/tests/gtest_s3_uri.cpp

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
#include <gtest/gtest.h>
2+
3+
#include <IO/S3/URI.h>
4+
#include "config.h"
5+
6+
7+
#if USE_AWS_S3
8+
9+
TEST(IOTestS3URI, PathStyleNoKey)
10+
{
11+
using namespace DB;
12+
13+
auto uri_with_no_key_and_no_slash = S3::URI("https://s3.region.amazonaws.com/bucket-name");
14+
15+
ASSERT_EQ(uri_with_no_key_and_no_slash.bucket, "bucket-name");
16+
ASSERT_EQ(uri_with_no_key_and_no_slash.key, "");
17+
18+
auto uri_with_no_key_and_with_slash = S3::URI("https://s3.region.amazonaws.com/bucket-name/");
19+
20+
ASSERT_EQ(uri_with_no_key_and_with_slash.bucket, "bucket-name");
21+
ASSERT_EQ(uri_with_no_key_and_with_slash.key, "");
22+
23+
ASSERT_ANY_THROW(S3::URI("https://s3.region.amazonaws.com/bucket-name//"));
24+
}
25+
26+
TEST(IOTestS3URI, PathStyleWithKey)
27+
{
28+
using namespace DB;
29+
30+
auto uri_with_no_key_and_no_slash = S3::URI("https://s3.region.amazonaws.com/bucket-name/key");
31+
32+
ASSERT_EQ(uri_with_no_key_and_no_slash.bucket, "bucket-name");
33+
ASSERT_EQ(uri_with_no_key_and_no_slash.key, "key");
34+
35+
auto uri_with_no_key_and_with_slash = S3::URI("https://s3.region.amazonaws.com/bucket-name/key/key/key/key");
36+
37+
ASSERT_EQ(uri_with_no_key_and_with_slash.bucket, "bucket-name");
38+
ASSERT_EQ(uri_with_no_key_and_with_slash.key, "key/key/key/key");
39+
}
40+
41+
#endif

0 commit comments

Comments
 (0)