D
Turns out there are two correct answers:
No index.
See https://dba.stackexchange.com/questions/308816/optimize-performance-of-nested-group-by#comment603363_308816 by https://dba.stackexchange.com/users/217655/andrew-sayer .
Covering index.
Unfortunately, they deleted their answer.
There, I will elaborate my findings on their behalf.
Benchmarks
I have refined my benchmarks from the original question.
The Python script looks like this:
#!/usr/bin/env python3
import random
import time
import sqlalchemy as sqla
random.seed(0)
engine = sqla.create_engine("sqlite:///playground.sqlite")
metadata = sqla.MetaData()
metadata.reflect(bind=engine)
def create_table(
table_name: str = "random_numbers",
):
table = sqla.Table(
table_name,
metadata,
sqla.Column("a", sqla.Integer),
sqla.Column("b", sqla.Integer),
sqla.Column("c", sqla.Integer),
sqla.Column("x", sqla.Float),
)
table.create(bind=engine)
def fill_table():
table = metadata.tables["random_numbers"]
PARTITION = 1000
for i in range(int(1E7) // PARTITION):
rows = [
{
"a": random.randrange(int(1E4)),
"b": random.randrange(int(1E5)),
"c": random.randrange(int(1E6)),
"x": random.random(),
} for _ in range(PARTITION)
]
with engine.connect() as conn:
conn.execute(table.insert().values(rows))
print((i + 1) * PARTITION)
def nested_queries():
table = metadata.tables["random_numbers"]
q_abc = sqla.select(
table.c.a,
table.c.b,
table.c.c,
sqla.func.max(table.c.x).label("x"),
).group_by(
table.c.a,
table.c.b,
table.c.c,
)
t_abc = q_abc.cte()
with engine.connect() as conn:
for row_it in conn.execute("EXPLAIN QUERY PLAN " + str(q_abc)):
print(row_it)
tic = time.time()
conn.execute(q_abc)
print(time.time() - tic)
q_ab = sqla.select(
t_abc.c.a,
t_abc.c.b,
sqla.func.min(t_abc.c.x).label("x"),
).group_by(
t_abc.c.a,
t_abc.c.b,
)
t_ab = q_ab.cte()
with engine.connect() as conn:
for row_it in conn.execute("EXPLAIN QUERY PLAN " + str(q_ab)):
print(row_it)
tic = time.time()
conn.execute(q_ab)
print(time.time() - tic)
q_a = sqla.select(
t_ab.c.a,
sqla.func.max(t_ab.c.x).label("x"),
).group_by(
t_ab.c.a,
)
t_a = q_a.cte()
with engine.connect() as conn:
for row_it in conn.execute("EXPLAIN QUERY PLAN " + str(q_a)):
print(row_it)
tic = time.time()
conn.execute(q_a)
print(time.time() - tic)
if name == "main":
if "random_numbers" not in metadata.tables:
create_table()
fill_table()
t = metadata.tables["random_numbers"]
with engine.connect() as conn:
conn.execute("ANALYZE")
nested_queries()
idx = sqla.Index(
"abc",
t.c.a,
t.c.b,
t.c.c,
)
idx.create(bind=engine)
with engine.connect() as conn:
conn.execute("ANALYZE")
nested_queries()
idx.drop(bind=engine)
idx = sqla.Index(
"abcx",
t.c.a,
t.c.b,
t.c.c,
t.c.x,
)
idx.create(bind=engine)
with engine.connect() as conn:
conn.execute("ANALYZE")
nested_queries()
idx.drop(bind=engine)
The printed results are:
(6, 0, 0, 'SCAN TABLE random_numbers')
(8, 0, 0, 'USE TEMP B-TREE FOR GROUP BY')
4.879260778427124
(2, 0, 0, 'CO-ROUTINE 1')
(8, 2, 0, 'SCAN TABLE random_numbers')
(10, 2, 0, 'USE TEMP B-TREE FOR GROUP BY')
(59, 0, 0, 'SCAN SUBQUERY 1')
(62, 0, 0, 'USE TEMP B-TREE FOR GROUP BY')
12.184722423553467
(2, 0, 0, 'CO-ROUTINE 2')
(4, 2, 0, 'CO-ROUTINE 1')
(10, 4, 0, 'SCAN TABLE random_numbers')
(12, 4, 0, 'USE TEMP B-TREE FOR GROUP BY')
(61, 2, 0, 'SCAN SUBQUERY 1')
(64, 2, 0, 'USE TEMP B-TREE FOR GROUP BY')
(108, 0, 0, 'SCAN SUBQUERY 2')
(111, 0, 0, 'USE TEMP B-TREE FOR GROUP BY')
16.619513034820557
(7, 0, 0, 'SCAN TABLE random_numbers USING INDEX abc')
0.0
(2, 0, 0, 'CO-ROUTINE 1')
(9, 2, 0, 'SCAN TABLE random_numbers USING INDEX abc')
(49, 0, 0, 'SCAN SUBQUERY 1')
(52, 0, 0, 'USE TEMP B-TREE FOR GROUP BY')
131.1691746711731
(2, 0, 0, 'CO-ROUTINE 2')
(4, 2, 0, 'CO-ROUTINE 1')
(11, 4, 0, 'SCAN TABLE random_numbers USING INDEX abc')
(51, 2, 0, 'SCAN SUBQUERY 1')
(54, 2, 0, 'USE TEMP B-TREE FOR GROUP BY')
(98, 0, 0, 'SCAN SUBQUERY 2')
(101, 0, 0, 'USE TEMP B-TREE FOR GROUP BY')
132.78631687164307
(6, 0, 0, 'SCAN TABLE random_numbers USING COVERING INDEX abcx')
0.0009453296661376953
(2, 0, 0, 'CO-ROUTINE 1')
(8, 2, 0, 'SCAN TABLE random_numbers USING COVERING INDEX abcx')
(47, 0, 0, 'SCAN SUBQUERY 1')
(50, 0, 0, 'USE TEMP B-TREE FOR GROUP BY')
4.824275493621826
(2, 0, 0, 'CO-ROUTINE 2')
(4, 2, 0, 'CO-ROUTINE 1')
(10, 4, 0, 'SCAN TABLE random_numbers USING COVERING INDEX abcx')
(49, 2, 0, 'SCAN SUBQUERY 1')
(52, 2, 0, 'USE TEMP B-TREE FOR GROUP BY')
(96, 0, 0, 'SCAN SUBQUERY 2')
(99, 0, 0, 'USE TEMP B-TREE FOR GROUP BY')
8.86772608757019
Interpretation
Here is a summary of the run times (in seconds):
no index
(a, b, c) index
covering index
GROUP BY a, b, c
4.88
0.00
0.00
GROUP BY a, b
12.18
131.17
4.82
GROUP BY a
16.62
132.79
8.87
The covering index gives the best results for this table and these queries.
For a single-level GROUP BY, the (a, b, c) index is better than no index.
For multi-level GROUP BYs, no index is better.
Single-level GROUP BY
The execution plan without an index is
(6, 0, 0, 'SCAN TABLE random_numbers')
(8, 0, 0, 'USE TEMP B-TREE FOR GROUP BY')
The execution plan for the (a, b, c) index is
(7, 0, 0, 'SCAN TABLE random_numbers USING INDEX abc')
Thus, the index saves me the trouble of building a B-TREE.
Multi-level GROUP BY
The execution plan without an index is
(2, 0, 0, 'CO-ROUTINE 1')
(8, 2, 0, 'SCAN TABLE random_numbers')
(10, 2, 0, 'USE TEMP B-TREE FOR GROUP BY')
(59, 0, 0, 'SCAN SUBQUERY 1')
(62, 0, 0, 'USE TEMP B-TREE FOR GROUP BY')
The execution plan for the (a, b, c) index is
(2, 0, 0, 'CO-ROUTINE 1')
(9, 2, 0, 'SCAN TABLE random_numbers USING INDEX abc')
(49, 0, 0, 'SCAN SUBQUERY 1')
(52, 0, 0, 'USE TEMP B-TREE FOR GROUP BY')
When scanning the subquery, the savings of the index are outweighed by the frequent look-ups in the table.
Covering index
The execution plan for the covering index is
(2, 0, 0, 'CO-ROUTINE 1')
(8, 2, 0, 'SCAN TABLE random_numbers USING COVERING INDEX abcx')
(47, 0, 0, 'SCAN SUBQUERY 1')
(50, 0, 0, 'USE TEMP B-TREE FOR GROUP BY')
The covering index combines the two advantages above, i.e. (i) no B-TREE, and (ii) no table look-ups.
References
https://www.postgresql.org/docs/current/indexes-index-only-scans.html .
https://sqlite.org/queryplanner.html .
https://dev.mysql.com/doc/refman/8.0/en/mysql-indexes.html .