forked from ClickHouse/ClickHouse
-
Notifications
You must be signed in to change notification settings - Fork 18
Expand file tree
/
Copy pathUtils.cpp
More file actions
110 lines (92 loc) · 3.18 KB
/
Utils.cpp
File metadata and controls
110 lines (92 loc) · 3.18 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#include <typeinfo>
#include "config.h"
#if USE_AVRO
#include <Processors/Formats/Impl/AvroRowInputFormat.h>
#include <Storages/ObjectStorage/DataLakes/Iceberg/Utils.h>
#include <filesystem>
using namespace DB;
#include <Columns/IColumn.h>
/// Error codes referenced in this file. They are only declared here (`extern`),
/// so their definitions must come from another translation unit.
namespace DB::ErrorCodes
{
extern const int BAD_ARGUMENTS;
}
namespace Iceberg
{
using namespace DB;
// This function is used to get the file path inside the directory which corresponds to iceberg table from the full blob path which is written in manifest and metadata files.
// For example, if the full blob path is s3://bucket/table_name/data/00000-1-1234567890.avro, the function will return table_name/data/00000-1-1234567890.avro
// Common path should end with "<table_name>" or "<table_name>/".
std::string getProperFilePathFromMetadataInfo(
std::string_view data_path,
std::string_view common_path,
std::string_view table_location,
std::string_view common_namespace)
{
auto trim_backward_slash = [](std::string_view str) -> std::string_view
{
if (str.ends_with('/'))
{
return str.substr(0, str.size() - 1);
}
return str;
};
auto trim_forward_slash = [](std::string_view str) -> std::string_view
{
if (str.starts_with('/'))
{
return str.substr(1);
}
return str;
};
common_path = trim_backward_slash(common_path);
table_location = trim_backward_slash(table_location);
if (data_path.starts_with(table_location) && table_location.ends_with(common_path))
{
return std::filesystem::path{common_path} / trim_forward_slash(data_path.substr(table_location.size()));
}
auto pos = data_path.find(common_path);
size_t good_pos = std::string::npos;
while (pos != std::string::npos)
{
auto potential_position = pos + common_path.size();
if ((std::string_view(data_path.data() + potential_position, 6) == "/data/")
|| (std::string_view(data_path.data() + potential_position, 10) == "/metadata/"))
{
good_pos = pos;
break;
}
size_t new_pos = data_path.find(common_path, pos + 1);
if (new_pos == std::string::npos)
{
break;
}
pos = new_pos;
}
if (good_pos != std::string::npos)
{
return std::string{data_path.substr(good_pos)};
}
else if (pos != std::string::npos)
{
return std::string{data_path.substr(pos)};
}
else
{
/// Data files can have different path
pos = data_path.find("://");
if (pos == std::string::npos)
throw ::DB::Exception(DB::ErrorCodes::BAD_ARGUMENTS, "Unexpected data path: '{}'", data_path);
pos = data_path.find("/", pos + 3);
if (pos == std::string::npos)
throw ::DB::Exception(DB::ErrorCodes::BAD_ARGUMENTS, "Unexpected data path: '{}'", data_path);
if (data_path.substr(pos + 1).starts_with(common_namespace))
{
auto new_pos = data_path.find("/", pos + 1);
if (new_pos - pos == common_namespace.length() + 1) /// bucket in the path
pos = new_pos;
}
return std::string(data_path.substr(pos));
}
}
}
#endif