-
Notifications
You must be signed in to change notification settings - Fork 152
Expand file tree
/
Copy pathq17_small_quantity_order.py
More file actions
89 lines (76 loc) · 3.13 KB
/
q17_small_quantity_order.py
File metadata and controls
89 lines (76 loc) · 3.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""
TPC-H Problem Statement Query 17:
The Small-Quantity-Order Revenue Query considers parts of a given brand and with a given container
type and determines the average lineitem quantity of such parts ordered for all orders (past and
pending) in the 7-year database. What would be the average yearly gross (undiscounted) loss in
revenue if orders for these parts with a quantity of less than 20% of this average were no longer
taken?
The above problem statement text is copyrighted by the Transaction Processing Performance Council
as part of their TPC Benchmark H Specification revision 2.18.0.
Reference SQL (from TPC-H specification, used by the benchmark suite)::
select
sum(l_extendedprice) / 7.0 as avg_yearly
from
lineitem,
part
where
p_partkey = l_partkey
and p_brand = 'Brand#23'
and p_container = 'MED BOX'
and l_quantity < (
select
0.2 * avg(l_quantity)
from
lineitem
where
l_partkey = p_partkey
);
"""
from datafusion import SessionContext, WindowFrame, col, lit
from datafusion import functions as F
from datafusion.expr import Window
from util import get_data_path
BRAND = "Brand#23"
CONTAINER = "MED BOX"
# Load the dataframes we need
ctx = SessionContext()
df_part = ctx.read_parquet(get_data_path("part.parquet")).select(
"p_partkey", "p_brand", "p_container"
)
df_lineitem = ctx.read_parquet(get_data_path("lineitem.parquet")).select(
"l_partkey", "l_quantity", "l_extendedprice"
)
# Limit to parts of the target brand/container, join their line items, and
# attach the per-part average quantity via a partitioned window function —
# the DataFrame form of the SQL's correlated ``avg(l_quantity)`` subquery.
whole_frame = WindowFrame("rows", None, None)
df = (
df_part.filter(col("p_brand") == BRAND, col("p_container") == CONTAINER)
.join(df_lineitem, left_on="p_partkey", right_on="l_partkey")
.with_column(
"avg_quantity",
F.avg(col("l_quantity")).over(
Window(partition_by=[col("l_partkey")], window_frame=whole_frame)
),
)
.filter(col("l_quantity") < lit(0.2) * col("avg_quantity"))
.aggregate([], [F.sum(col("l_extendedprice")).alias("total")])
.select((col("total") / lit(7.0)).alias("avg_yearly"))
)
df.show()