-
Notifications
You must be signed in to change notification settings - Fork 4.1k
GH-49689: [R][C++] Parquets do not support list-columns of ordered factors (ordered dictionaries) #49937
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
GH-49689: [R][C++] Parquets do not support list-columns of ordered factors (ordered dictionaries) #49937
Changes from all commits
d18517b
3d48f84
97135f5
0a21ada
bcc614b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1761,4 +1761,81 @@ TEST(TestDictionaryUnifier, TableZeroColumns) { | |
| AssertTablesEqual(*table, *unified); | ||
| } | ||
|
|
||
| // GH-49689: Ordered dictionary tests | ||
|
|
||
| TEST(TestDictionaryBuilderOrdered, TypePreservesOrderedFlag) { | ||
| auto ordered_type = dictionary(int8(), utf8(), /*ordered=*/true); | ||
| std::unique_ptr<ArrayBuilder> boxed_builder; | ||
| ASSERT_OK(MakeBuilder(default_memory_pool(), ordered_type, &boxed_builder)); | ||
|
|
||
| auto builder_type = boxed_builder->type(); | ||
| ASSERT_TRUE(checked_cast<const DictionaryType&>(*builder_type).ordered()); | ||
| } | ||
|
|
||
| TEST(TestDictionaryBuilderOrdered, UnorderedTypeStaysUnordered) { | ||
| auto unordered_type = dictionary(int8(), utf8(), /*ordered=*/false); | ||
| std::unique_ptr<ArrayBuilder> boxed_builder; | ||
| ASSERT_OK(MakeBuilder(default_memory_pool(), unordered_type, &boxed_builder)); | ||
|
|
||
| auto builder_type = boxed_builder->type(); | ||
| ASSERT_FALSE(checked_cast<const DictionaryType&>(*builder_type).ordered()); | ||
| } | ||
|
|
||
| TEST(TestDictionaryBuilderOrdered, FinishPreservesOrderedFlag) { | ||
| auto ordered_type = dictionary(int8(), utf8(), /*ordered=*/true); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You could have a |
||
| std::unique_ptr<ArrayBuilder> boxed_builder; | ||
| ASSERT_OK(MakeBuilder(default_memory_pool(), ordered_type, &boxed_builder)); | ||
| auto& builder = checked_cast<DictionaryBuilder<StringType>&>(*boxed_builder); | ||
|
|
||
| ASSERT_OK(builder.Append("a")); | ||
| ASSERT_OK(builder.Append("b")); | ||
| ASSERT_OK(builder.Append("a")); | ||
|
|
||
| std::shared_ptr<Array> result; | ||
| ASSERT_OK(builder.Finish(&result)); | ||
|
|
||
| const auto& result_type = checked_cast<const DictionaryType&>(*result->type()); | ||
| ASSERT_TRUE(result_type.ordered()); | ||
|
|
||
| auto ex_dict = ArrayFromJSON(utf8(), R"(["a", "b"])"); | ||
| auto ex_indices = ArrayFromJSON(int8(), "[0, 1, 0]"); | ||
| DictionaryArray expected(ordered_type, ex_indices, ex_dict); | ||
| AssertArraysEqual(expected, *result); | ||
| } | ||
|
|
||
| TEST(TestDictionaryBuilderOrdered, ListOfOrderedDictionary) { | ||
| auto ordered_dict_type = dictionary(int8(), utf8(), /*ordered=*/true); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same here. |
||
| auto list_type = list(field("item", ordered_dict_type)); | ||
|
|
||
| std::unique_ptr<ArrayBuilder> boxed_builder; | ||
| ASSERT_OK(MakeBuilder(default_memory_pool(), list_type, &boxed_builder)); | ||
| auto& list_builder = checked_cast<ListBuilder&>(*boxed_builder); | ||
| auto& dict_builder = | ||
| checked_cast<DictionaryBuilder<StringType>&>(*list_builder.value_builder()); | ||
|
|
||
| ASSERT_OK(list_builder.Append()); | ||
| ASSERT_OK(dict_builder.Append("a")); | ||
| ASSERT_OK(dict_builder.Append("b")); | ||
| ASSERT_OK(list_builder.Append()); | ||
| ASSERT_OK(dict_builder.Append("a")); | ||
|
|
||
| std::shared_ptr<Array> result; | ||
| ASSERT_OK(list_builder.Finish(&result)); | ||
|
|
||
| const auto& result_list_type = checked_cast<const ListType&>(*result->type()); | ||
| const auto& result_dict_type = | ||
| checked_cast<const DictionaryType&>(*result_list_type.value_type()); | ||
| ASSERT_TRUE(result_dict_type.ordered()); | ||
| } | ||
|
|
||
| TEST(TestDictionaryBuilderOrdered, MakeDictionaryBuilderPreservesOrdered) { | ||
| auto ordered_type = dictionary(int8(), utf8(), /*ordered=*/true); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same here? |
||
| std::unique_ptr<ArrayBuilder> builder; | ||
| ASSERT_OK(MakeDictionaryBuilder(default_memory_pool(), ordered_type, | ||
| /*dictionary=*/nullptr, &builder)); | ||
|
|
||
| auto builder_type = builder->type(); | ||
| ASSERT_TRUE(checked_cast<const DictionaryType&>(*builder_type).ordered()); | ||
| } | ||
|
|
||
|
Comment on lines
+1764
to
+1840
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is all AI-generated. I roughly understood what it does but am not confident in it, other than having checked it looks relatively idiomatic, and all of these tests except |
||
| } // namespace arrow | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -160,7 +160,8 @@ class DictionaryBuilderBase : public ArrayBuilder { | |
| delta_offset_(0), | ||
| byte_width_(-1), | ||
| indices_builder_(start_int_size, pool, alignment), | ||
| value_type_(value_type) {} | ||
| value_type_(value_type), | ||
| ordered_(false) {} | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Since |
||
|
|
||
| template <typename T1 = T> | ||
| explicit DictionaryBuilderBase( | ||
|
|
@@ -173,7 +174,8 @@ class DictionaryBuilderBase : public ArrayBuilder { | |
| delta_offset_(0), | ||
| byte_width_(-1), | ||
| indices_builder_(pool, alignment), | ||
| value_type_(value_type) {} | ||
| value_type_(value_type), | ||
| ordered_(false) {} | ||
|
|
||
| template <typename T1 = T> | ||
| explicit DictionaryBuilderBase( | ||
|
|
@@ -187,7 +189,8 @@ class DictionaryBuilderBase : public ArrayBuilder { | |
| delta_offset_(0), | ||
| byte_width_(-1), | ||
| indices_builder_(index_type, pool, alignment), | ||
| value_type_(value_type) {} | ||
| value_type_(value_type), | ||
| ordered_(false) {} | ||
|
|
||
| template <typename B = BuilderType, typename T1 = T> | ||
| DictionaryBuilderBase(uint8_t start_int_size, | ||
|
|
@@ -202,7 +205,8 @@ class DictionaryBuilderBase : public ArrayBuilder { | |
| delta_offset_(0), | ||
| byte_width_(static_cast<const T1&>(*value_type).byte_width()), | ||
| indices_builder_(start_int_size, pool, alignment), | ||
| value_type_(value_type) {} | ||
| value_type_(value_type), | ||
| ordered_(false) {} | ||
|
|
||
| template <typename T1 = T> | ||
| explicit DictionaryBuilderBase( | ||
|
|
@@ -214,7 +218,8 @@ class DictionaryBuilderBase : public ArrayBuilder { | |
| delta_offset_(0), | ||
| byte_width_(static_cast<const T1&>(*value_type).byte_width()), | ||
| indices_builder_(pool, alignment), | ||
| value_type_(value_type) {} | ||
| value_type_(value_type), | ||
| ordered_(false) {} | ||
|
|
||
| template <typename T1 = T> | ||
| explicit DictionaryBuilderBase( | ||
|
|
@@ -227,7 +232,8 @@ class DictionaryBuilderBase : public ArrayBuilder { | |
| delta_offset_(0), | ||
| byte_width_(static_cast<const T1&>(*value_type).byte_width()), | ||
| indices_builder_(index_type, pool, alignment), | ||
| value_type_(value_type) {} | ||
| value_type_(value_type), | ||
| ordered_(false) {} | ||
|
|
||
| template <typename T1 = T> | ||
| explicit DictionaryBuilderBase( | ||
|
|
@@ -243,7 +249,8 @@ class DictionaryBuilderBase : public ArrayBuilder { | |
| delta_offset_(0), | ||
| byte_width_(-1), | ||
| indices_builder_(pool, alignment), | ||
| value_type_(dictionary->type()) {} | ||
| value_type_(dictionary->type()), | ||
| ordered_(false) {} | ||
|
|
||
| ~DictionaryBuilderBase() override = default; | ||
|
|
||
|
|
@@ -490,9 +497,11 @@ class DictionaryBuilderBase : public ArrayBuilder { | |
| Status Finish(std::shared_ptr<DictionaryArray>* out) { return FinishTyped(out); } | ||
|
|
||
| std::shared_ptr<DataType> type() const override { | ||
| return ::arrow::dictionary(indices_builder_.type(), value_type_); | ||
| return ::arrow::dictionary(indices_builder_.type(), value_type_, ordered_); | ||
| } | ||
|
|
||
| void set_ordered(bool ordered) { ordered_ = ordered; } | ||
|
|
||
| protected: | ||
| template <typename c_type> | ||
| Status AppendArraySliceImpl(const typename TypeTraits<T>::ArrayType& dict, | ||
|
|
@@ -561,6 +570,7 @@ class DictionaryBuilderBase : public ArrayBuilder { | |
|
|
||
| BuilderType indices_builder_; | ||
| std::shared_ptr<DataType> value_type_; | ||
| bool ordered_; | ||
| }; | ||
|
|
||
| template <typename BuilderType> | ||
|
|
@@ -647,7 +657,7 @@ class DictionaryBuilderBase<BuilderType, NullType> : public ArrayBuilder { | |
|
|
||
| Status FinishInternal(std::shared_ptr<ArrayData>* out) override { | ||
| ARROW_RETURN_NOT_OK(indices_builder_.FinishInternal(out)); | ||
| (*out)->type = dictionary((*out)->type, null()); | ||
| (*out)->type = dictionary((*out)->type, null(), ordered_); | ||
| (*out)->dictionary = NullArray(0).data(); | ||
| return Status::OK(); | ||
| } | ||
|
|
@@ -659,11 +669,14 @@ class DictionaryBuilderBase<BuilderType, NullType> : public ArrayBuilder { | |
| Status Finish(std::shared_ptr<DictionaryArray>* out) { return FinishTyped(out); } | ||
|
|
||
| std::shared_ptr<DataType> type() const override { | ||
| return ::arrow::dictionary(indices_builder_.type(), null()); | ||
| return ::arrow::dictionary(indices_builder_.type(), null(), ordered_); | ||
| } | ||
|
|
||
| void set_ordered(bool ordered) { ordered_ = ordered; } | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Instead of having this separate method, perhaps we can add a |
||
|
|
||
| protected: | ||
| BuilderType indices_builder_; | ||
| bool ordered_ = false; | ||
| }; | ||
|
|
||
| } // namespace internal | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -168,17 +168,24 @@ struct DictionaryBuilderCase { | |
| template <typename ValueType> | ||
| Status CreateFor() { | ||
| using AdaptiveBuilderType = DictionaryBuilder<ValueType>; | ||
| using ExactBuilderType = | ||
| internal::DictionaryBuilderBase<TypeErasedIntBuilder, ValueType>; | ||
| if (dictionary != nullptr) { | ||
| out->reset(new AdaptiveBuilderType(dictionary, pool)); | ||
| auto builder = new AdaptiveBuilderType(dictionary, pool); | ||
| builder->set_ordered(ordered); | ||
| out->reset(builder); | ||
| } else if (exact_index_type) { | ||
| if (!is_integer(index_type->id())) { | ||
| return Status::TypeError("MakeBuilder: invalid index type ", *index_type); | ||
| } | ||
| out->reset(new internal::DictionaryBuilderBase<TypeErasedIntBuilder, ValueType>( | ||
| index_type, value_type, pool)); | ||
| auto builder = new ExactBuilderType(index_type, value_type, pool); | ||
| builder->set_ordered(ordered); | ||
| out->reset(builder); | ||
| } else { | ||
| auto start_int_size = index_type->byte_width(); | ||
| out->reset(new AdaptiveBuilderType(start_int_size, value_type, pool)); | ||
| auto builder = new AdaptiveBuilderType(start_int_size, value_type, pool); | ||
| builder->set_ordered(ordered); | ||
| out->reset(builder); | ||
| } | ||
| return Status::OK(); | ||
| } | ||
|
|
@@ -188,8 +195,9 @@ struct DictionaryBuilderCase { | |
| MemoryPool* pool; | ||
| const std::shared_ptr<DataType>& index_type; | ||
| const std::shared_ptr<DataType>& value_type; | ||
| const std::shared_ptr<Array>& dictionary; | ||
| std::shared_ptr<Array> dictionary; | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This was updated as without it, we ended up with a dangling reference when |
||
| bool exact_index_type; | ||
| bool ordered; | ||
| std::unique_ptr<ArrayBuilder>* out; | ||
| }; | ||
|
|
||
|
|
@@ -206,6 +214,7 @@ struct MakeBuilderImpl { | |
| dict_type.value_type(), | ||
| /*dictionary=*/nullptr, | ||
| exact_index_type, | ||
| dict_type.ordered(), | ||
| &out}; | ||
| return visitor.Make(); | ||
| } | ||
|
|
@@ -332,9 +341,13 @@ Status MakeDictionaryBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& | |
| const std::shared_ptr<Array>& dictionary, | ||
| std::unique_ptr<ArrayBuilder>* out) { | ||
| const auto& dict_type = static_cast<const DictionaryType&>(*type); | ||
| DictionaryBuilderCase visitor = { | ||
| pool, dict_type.index_type(), dict_type.value_type(), | ||
| dictionary, /*exact_index_type=*/false, out}; | ||
| DictionaryBuilderCase visitor = {pool, | ||
| dict_type.index_type(), | ||
| dict_type.value_type(), | ||
| dictionary, | ||
| /*exact_index_type=*/false, | ||
| dict_type.ordered(), | ||
| out}; | ||
| return visitor.Make(); | ||
| } | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
These two tests can be factored into a single one by using a
forloop.