Skip to content

Commit 0c29f21

Browse files
author
Ghislain Fourny
committed
Upgrade to Spark 4.
1 parent a9a357e commit 0c29f21

3 files changed

Lines changed: 50 additions & 3 deletions

File tree

README.md

Lines changed: 48 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ for str in rdd.take(10):
121121
print(str);
122122
123123
# It is also possible to write the output to a file locally or on a cluster. The API is similar to that of Spark dataframes.
124-
# Note that it creates a directory and stores the (potentially very large) output in a shared directory.
124+
# Note that it creates a directory and stores the (potentially very large) output in a sharded directory.
125125
# RumbleDB has already been tested on up to 64 AWS machines and with hundreds of TBs of data.
126126
# Of course the examples below are so small that it makes more sense to process the results locally with Python,
127127
# but this shows how GBs or TBs of data obtained from JSONiq can be written back to disk.
@@ -131,4 +131,51 @@ seq.write().mode("overwrite").parquet("outputparquet");
131131
132132
seq = rumble.jsoniq("1+1");
133133
seq.write().mode("overwrite").text("outputtext");
134+
135+
# A more complex, standalone query
136+
137+
seq = rumble.jsoniq("""
138+
139+
let $stores :=
140+
[
141+
{ "store number" : 1, "state" : "MA" },
142+
{ "store number" : 2, "state" : "MA" },
143+
{ "store number" : 3, "state" : "CA" },
144+
{ "store number" : 4, "state" : "CA" }
145+
]
146+
let $sales := [
147+
{ "product" : "broiler", "store number" : 1, "quantity" : 20 },
148+
{ "product" : "toaster", "store number" : 2, "quantity" : 100 },
149+
{ "product" : "toaster", "store number" : 2, "quantity" : 50 },
150+
{ "product" : "toaster", "store number" : 3, "quantity" : 50 },
151+
{ "product" : "blender", "store number" : 3, "quantity" : 100 },
152+
{ "product" : "blender", "store number" : 3, "quantity" : 150 },
153+
{ "product" : "socks", "store number" : 1, "quantity" : 500 },
154+
{ "product" : "socks", "store number" : 2, "quantity" : 10 },
155+
{ "product" : "shirt", "store number" : 3, "quantity" : 10 }
156+
]
157+
let $join :=
158+
for $store in $stores[], $sale in $sales[]
159+
where $store."store number" = $sale."store number"
160+
return {
161+
"nb" : $store."store number",
162+
"state" : $store.state,
163+
"sold" : $sale.product
164+
}
165+
return [$join]
166+
""");
167+
168+
print(seq.json());
169+
170+
seq = rumble.jsoniq("""
171+
for $product in json-lines("http://rumbledb.org/samples/products-small.json", 10)
172+
group by $store-number := $product.store-number
173+
order by $store-number ascending
174+
return {
175+
"store" : $store-number,
176+
"products" : [ distinct-values($product.product) ]
177+
}
178+
""");
179+
print(seq.json());
180+
134181
```

pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,11 @@ build-backend = "setuptools.build_meta"
44

55
[project]
66
name = "jsoniq"
7-
version = "0.1.0a8"
7+
version = "0.1.0a9"
88
description = "Python edition of RumbleDB, a JSONiq engine"
99
requires-python = ">=3.11"
1010
dependencies = [
11-
"pyspark==3.5.5"
11+
"pyspark==4.0.0"
1212
]
1313
authors = [
1414
{name = "Ghislain Fourny", email = "ghislain.fourny@inf.ethz.ch"},
705 KB
Binary file not shown.

0 commit comments

Comments
 (0)