Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 55 additions & 16 deletions datafusion/sql/src/expr/function.rs
Original file line number Diff line number Diff line change
Expand Up @@ -542,32 +542,70 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
// accept a WITHIN GROUP clause.
let supports_within_group = fm.supports_within_group_clause();

if !within_group.is_empty() && !supports_within_group {
// Built-in ordered-set aggregates must also support WITHIN GROUP
let is_builtin_ordered_set = matches!(
name.as_str(),
"percentile_cont"
| "quantile_cont"
| "approx_percentile_cont"
| "approx_percentile_cont_with_weight"
);

let supports_within_group =
supports_within_group || is_builtin_ordered_set;

let mut within_group = within_group;
let mut order_by = order_by;

if supports_within_group
&& within_group.is_empty()
&& !order_by.is_empty()
{
// Inline ORDER BY syntax:
// quantile_cont(value, percentile ORDER BY value)
if args.len() >= 2 {
args.remove(0);
arg_names.remove(0);
}

within_group = order_by;
order_by = vec![];
}

if !supports_within_group && !within_group.is_empty() {
return plan_err!(
"WITHIN GROUP is only supported for ordered-set aggregate functions"
);
}

// If the UDAF supports WITHIN GROUP, convert the ordering into
// sort expressions and prepend them as unnamed function args.
let order_by = if supports_within_group {
let (within_group_sorts, new_args, new_arg_names) = self
.extract_and_prepend_within_group_args(
let order_by: Vec<SortExpr> = if supports_within_group {
if !within_group.is_empty() {
// WITHIN GROUP syntax
let sorts = self.order_by_to_sort_expr(
within_group,
args,
arg_names,
schema,
planner_context,
false,
None,
)?;
args = new_args;
arg_names = new_arg_names;
within_group_sorts
} else {
let order_by = if !order_by.is_empty() {
order_by

if sorts.len() != 1 {
return plan_err!(
"Only a single ordering expression is permitted in WITHIN GROUP clause"
);
}

// Prepend ordered value expression to args
let value_expr = sorts[0].expr.clone();
arg_names = std::iter::once(None).chain(arg_names).collect();
args = std::iter::once(value_expr).chain(args).collect();

sorts
} else {
within_group
};
vec![]
}
} else {
// Normal aggregate behavior
self.order_by_to_sort_expr(
order_by,
schema,
Expand Down Expand Up @@ -867,6 +905,7 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
Ok((exprs, names))
}

#[expect(dead_code)]
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this means the code is not used -- shouldn't we just remove it rather than making #[expect(dead)]?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think I may be missing part of the intended scope, so I’d appreciate some clarification.

It looks like we may need broader test coverage here - not only for quantile_cont, but also for other aggregate functions and similar usage patterns, to better define the expected behavior.

Originally, this issue was about rejecting invalid ORDER BY usage inside aggregate argument lists earlier. My initial fix enforced that, but it introduced a breaking change, so I updated the implementation to support both syntaxes for quantile_cont. This works for the cases I’ve added tests for.

Regarding your example:

SELECT approx_percentile_cont_with_weight(1, 0.95 ORDER BY x)
FROM (VALUES (1), (2), (3)) AS t(x);

I didn’t consider this kind of usage across other aggregate functions when implementing the change, so I’m not fully sure what the expected behavior should be here.

Would you expect inline ORDER BY to be supported only for quantile_cont (and similar ordered-set aggregates), or should this be extended more generally to other aggregate functions as well?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would you expect inline ORDER BY to be supported only for quantile_cont (and similar ordered-set aggregates), or should this be extended more generally to other aggregate functions as well?

I am not sure -- I basically expect that datafusion would follow other implementations rather than creating our own syntax. In the case where previous versions of DataFusion supported syntax that is not supported by other systems, I would expect that syntax to keep working, though we don't have to extend it to other functions that didn't previously support it

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks, that helps clarify the direction.

I’ll keep the inline ORDER BY syntax only for cases where it was already supported. I’ll also add tests to ensure we preserve existing behavior while rejecting unsupported cases more clearly.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you -- I am sorry for the delay / back and forth on this -- but we have been bitten downstream in the past when stuff that used to work stopped working

fn extract_and_prepend_within_group_args(
&self,
within_group: Vec<OrderByExpr>,
Expand Down
30 changes: 30 additions & 0 deletions datafusion/sqllogictest/test_files/aggregate.slt
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,36 @@ CREATE TABLE group_median_table_nullable (
( 'group1', 125, 32766, 2147483646, arrow_cast(9223372036854775806,'Int64'), 100, 101, 4294967294, arrow_cast(100,'UInt64'), 3.2, 5.5, arrow_cast('NAN','Float64'), 0.0004, 0.0004 ),
( 'group1', 127, 32767, 2147483647, arrow_cast(9223372036854775807,'Int64'), 255, 65535, 4294967295, 18446744073709551615, 2.2, 2.2, arrow_cast('NAN','Float64'), 0.0005, 0.0005 )

query R
select quantile_cont(col0, 0.75 order by col0)
from values (1, 3), (2, 2), (3, 1) t(col0, col1);
----
2.5

query R
select quantile_cont(col0, 0.75 order by col0 desc)
from values (1, 3), (2, 2), (3, 1) t(col0, col1);
----
1.5

query R
select quantile_cont(0.75 order by col0)
from values (1, 3), (2, 2), (3, 1) t(col0, col1);
----
2.5

query R
select quantile_cont(0.75 order by col0 desc)
from values (1, 3), (2, 2), (3, 1) t(col0, col1);
----
1.5

query R
select quantile_cont(0.75) within group (order by col0)
from values (1), (2), (3) t(col0);
----
2.5

#######
# Error tests
#######
Expand Down
Loading