Support for pandas dataframes.

Ghislain Fourny · Ghislain Fourny · commit 4704062d0d0b · 2025-07-22T10:27:17.000+02:00
diff --git a/README.md b/README.md
@@ -46,6 +46,7 @@ When passing Python values to JSONiq or getting them from a JSONiq queries, the
 
 | Python | JSONiq |
 |-------|-------|
+|tuple|sequence of items|
 |dict|object|
 |list|array|
 |str|string|
@@ -73,6 +74,7 @@ You can directly copy paste the code below to a Python file and execute it with
 
 ```
 from jsoniq import RumbleSession
+import pandas as pd
 
 # The syntax to start a session is similar to that of Spark.
 # A RumbleSession is a SparkSession that additionally knows about RumbleDB.
@@ -155,16 +157,16 @@ print(seq.json());
 ###### Binding JSONiq variables to Python values ###########
 ############################################################
 
-# It is possible to bind a JSONiq variable to a list of native Python values
+# It is possible to bind a JSONiq variable to a tuple of native Python values
 # and then use it in a query.
 # JSONiq, variables are bound to sequences of items, just like the results of JSONiq
 # queries are sequence of items.
-# A Python list will be seamlessly converted to a sequence of items by the library.
+# A Python tuple will be seamlessly converted to a sequence of items by the library.
 # Currently we only support strs, ints, floats, booleans, None, lists, and dicts.
 # But if you need more (like date, bytes, etc) we will add them without any problem.
 # JSONiq has a rich type system.
  
-rumble.bind('$c', [1,2,3,4, 5, 6])
+rumble.bind('$c', (1,2,3,4, 5, 6))
 print(rumble.jsoniq("""
 for $v in $c
 let $parity := $v mod 2
@@ -176,7 +178,7 @@ return { switch($parity)
 }
 """).json())
 
-rumble.bind('$c', [[1,2,3],[4,5,6]])
+rumble.bind('$c', ([1,2,3],[4,5,6]))
 print(rumble.jsoniq("""
 for $i in $c
 return [
@@ -185,18 +187,34 @@ return [
 ]
 """).json())
 
-rumble.bind('$c', [{"foo":[1,2,3]},{"foo":[4,{"bar":[1,False, None]},6]}])
+rumble.bind('$c', ({"foo":[1,2,3]},{"foo":[4,{"bar":[1,False, None]},6]}))
 print(rumble.jsoniq('{ "results" : $c.foo[[2]] }').json())
 
-# It is possible to bind only one value. The it must be provided as a singleton list.
+# It is possible to bind only one value. Then it must be provided as a singleton tuple.
 # This is because in JSONiq, an item is the same a sequence of one item.
-rumble.bind('$c', [42])
+rumble.bind('$c', (42,))
 print(rumble.jsoniq('for $i in 1 to $c return $i*$i').json())
 
 # For convenience and code readability, you can also use bindOne().
 rumble.bindOne('$c', 42)
 print(rumble.jsoniq('for $i in 1 to $c return $i*$i').json())
 
+##########################################################
+##### Binding JSONiq variables to pandas DataFrames ######
+##### Getting the output as a Pandas DataFrame      ######
+##########################################################
+
+# Creating a dummy pandas dataframe
+data = {'Name': ['Alice', 'Bob', 'Charlie'],
+        'Age': [30,25,35]};
+pdf = pd.DataFrame(data);
+
+# Binding a pandas dataframe
+rumble.bind('$a',pdf);
+seq = rumble.jsoniq('$a.Name')
+# Getting the output as a pandas dataframe
+print(seq.pdf())
+
 
 ################################################
 ##### Using Pyspark DataFrames with JSONiq #####
@@ -324,6 +342,11 @@ Even more queries can be found [here](https://colab.research.google.com/github/R
 
 # Last updates
 
+## Version 0.1.0 alpha 13
+- Allow binding JSONiq variables to pandas dataframes
+- Allow retrieving the output of a JSONiq query as a pandas dataframe (if the output is available as a dataframe, i.e., availableOutputs() returns a list that contains "DataFrame")
+- Clean up the mapping to strictly map tuples to sequences of items, and lists to array items. This will avoid confusion between arrays and sequences.
+
 ## Version 0.1.0 alpha 12
 - Allow to bind JSONiq variables to Python values (mapping Python lists to sequences of items). This makes it possible to manipulate Python values directly with JSONiq and even without any knowledge of Spark at all.
 - renamed bindDataFrameAsVariable() to bind(), which can be used both with DataFrames and Python lists.
diff --git a/pyproject.toml b/pyproject.toml
@@ -8,7 +8,8 @@ version = "0.1.0a12"
 description = "Python edition of RumbleDB, a JSONiq engine"
 requires-python = ">=3.11"
 dependencies = [
-    "pyspark==4.0"
+    "pyspark==4.0",
+    "pandas==2.3"
 ]
 authors = [
   {name = "Ghislain Fourny", email = "ghislain.fourny@inf.ethz.ch"},
@@ -23,6 +24,8 @@ classifiers = [
   "Programming Language :: Python :: 3.11",
   "Programming Language :: Python :: 3.12",
   "Programming Language :: Python :: 3.13",
+  "Typing :: Typed",
+  "License :: OSI Approved :: Apache Software License"
 ]
 
 [tool.setuptools.packages.find]
diff --git a/requirements.txt b/requirements.txt
@@ -1 +1,2 @@
-pyspark==4.0.0
+pyspark==4.0
+pandas==2.3
diff --git a/src/jsoniq/sequence.py b/src/jsoniq/sequence.py
@@ -10,7 +10,7 @@ def __init__(self, sequence, sparksession):
         self._sparksession = sparksession
 
     def json(self):
-        return [json.loads(l.serializeAsJSON()) for l in self._jsequence.items()]
+        return tuple([json.loads(l.serializeAsJSON()) for l in self._jsequence.items()])
 
     def rdd(self):
         rdd = self._jsequence.getAsPickledStringRDD();
@@ -20,6 +20,9 @@ def rdd(self):
     def df(self):
         return DataFrame(self._jsequence.getAsDataFrame(), self._sparksession)
 
+    def pdf(self):
+        return self.df().toPandas()
+
     def nextJSON(self):
         return self._jsequence.next().serializeAsJSON()
 
diff --git a/src/jsoniq/session.py b/src/jsoniq/session.py
@@ -4,6 +4,7 @@
 import platform
 import os
 import re
+import pandas as pd
 import importlib.resources as pkg_resources
 
 with pkg_resources.path("jsoniq.jars", "rumbledb-1.24.0.jar") as jar_path:
@@ -84,6 +85,8 @@ def __getattr__(self, name):
     _builder = Builder()
 
     def convert(self, value):
+        if isinstance(value, tuple):
+            return [ self.convert(v) for v in value]
         if isinstance(value, bool):
             return self._sparksession._jvm.org.rumbledb.items.ItemFactory.getInstance().createBooleanItem(value)
         elif isinstance(value, str):
@@ -114,18 +117,30 @@ def bind(self, name: str, valueToBind):
         if not name.startswith("$"):
             raise ValueError("Variable name must start with a dollar symbol ('$').")
         name = name[1:]
-        if isinstance(valueToBind, list):
-            items = [ self.convert(value) for value in valueToBind]
-            conf.setExternalVariableValue(name, items)
-            return self
-        if(hasattr(valueToBind, "_get_object_id")):
+        if isinstance(valueToBind, SequenceOfItems):
+            outputs = valueToBind.availableOutputs()
+            if isinstance(outputs, list) and "DataFrame" in outputs:
+                conf.setExternalVariableValue(name, valueToBind.df());
+            # TODO support binding a variable to an RDD
+            #elif isinstance(outputs, list) and "RDD" in outputs:
+            #    conf.setExternalVariableValue(name, valueToBind.getAsRDD());
+            else:
+                conf.setExternalVariableValue(name, valueToBind.items());
+        elif isinstance(valueToBind, pd.DataFrame):
+            pysparkdf = self._sparksession.createDataFrame(valueToBind)
+            conf.setExternalVariableValue(name, pysparkdf._jdf);
+        elif isinstance(valueToBind, tuple):
+            conf.setExternalVariableValue(name, self.convert(valueToBind))
+        elif isinstance(valueToBind, list):
+            raise ValueError("To avoid confusion, a sequence of items must be provided as a Python tuple, not as a Python list. Lists are mapped to single array items, while tuples are mapped to sequences of items. If you want to bind the variable to one array item, then you need to wrap the provided list inside a singleton tuple and try again, or you can also call bindOne() instead.")
+        elif(hasattr(valueToBind, "_get_object_id")):
             conf.setExternalVariableValue(name, valueToBind);
         else:
             conf.setExternalVariableValue(name, valueToBind._jdf);
         return self;
 
     def bindOne(self, name: str, value):
-        return self.bind(name, [value])
+        return self.bind(name, (value,))
 
     def bindDataFrameAsVariable(self, name: str, df):
         conf = self._jrumblesession.getConfiguration();

Original file line number	Diff line number	Diff line change
`@@ -1 +1,2 @@`
`1`		`-pyspark==4.0.0`
	`1`	`+pyspark==4.0`
	`2`	`+pandas==2.3`