From cf239d6d87500a7e1863bfa30ba27d3342b0e927 Mon Sep 17 00:00:00 2001 From: Vikrant Puppala Date: Tue, 19 May 2026 09:53:10 +0000 Subject: [PATCH 1/2] feat(backend/kernel): match Thrift backend's user-visible surface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three small parity fixes that close 34 of the 66 diffs the Thrift vs Kernel comparator surfaced against Dogfood today (see databricks-driver-test docs/comparator-thrift-vs-kernel-diffs.md). None of these are user-data correctness — they're contract-level differences that affect code reading the cursor's description, catching server errors by class, or calling get_columns with no catalog. 1. PEP-249 description.null_ok always None description_from_arrow_schema previously took null_ok from the Arrow field.nullable bit. The Thrift backend has always reported None for this slot (PEP 249 permits either). Hardcode None so the kernel backend is a drop-in for code that reads cursor.description[i][6]. 2. SqlError maps to ServerOperationError Server-side SQL failures (syntax error, missing object, etc.) used to wrap into the generic DatabaseError on the kernel backend. The Thrift backend raises ServerOperationError for the same shape. ServerOperationError is a DatabaseError subclass, so existing catches of the base class are unaffected — but code that catches the specific subclass now works equivalently. 3. get_columns accepts catalog_name=None Previously rejected at the connector layer with a ProgrammingError. The kernel's list_columns now issues SHOW COLUMNS IN ALL CATALOGS server-side when catalog is None; the response carries catalogName per row so TABLE_CAT is correctly attributed without client-side enumeration. Matches Thrift's getColumns(null, ...) behaviour. Two other comparator-surfaced gaps are out of scope: - INTERVAL columns crashing on fetch (kernel-side fix; intervals need to be stringified post-decode). - _use_arrow_native_complex_types=False not honoured (pre-existing gap shared with the native SEA backend; needs separate work). Signed-off-by: Vikrant Puppala --- src/databricks/sql/backend/kernel/_errors.py | 9 ++- src/databricks/sql/backend/kernel/client.py | 13 ++--- .../sql/backend/kernel/type_mapping.py | 10 ++-- tests/unit/test_kernel_client.py | 58 +++++++++++++------ tests/unit/test_kernel_result_set.py | 11 +++- tests/unit/test_kernel_type_mapping.py | 25 +++++--- 6 files changed, 83 insertions(+), 43 deletions(-) diff --git a/src/databricks/sql/backend/kernel/_errors.py b/src/databricks/sql/backend/kernel/_errors.py index a844ff716..78b542300 100644 --- a/src/databricks/sql/backend/kernel/_errors.py +++ b/src/databricks/sql/backend/kernel/_errors.py @@ -43,9 +43,9 @@ Error, OperationalError, ProgrammingError, + ServerOperationError, ) - try: import databricks_sql_kernel as _kernel # type: ignore[import-not-found] except ImportError as exc: # pragma: no cover - same hint as client.py @@ -76,7 +76,12 @@ "Internal": DatabaseError, "InvalidStatementHandle": ProgrammingError, "NetworkError": OperationalError, - "SqlError": DatabaseError, + # `SqlError` is a server-side query failure (syntax error, missing + # object, etc.) — exactly what the Thrift backend surfaces as + # `ServerOperationError`. Match Thrift's contract so user code that + # catches `ServerOperationError` (a subclass of `DatabaseError`) + # works equivalently with `use_kernel=True`. + "SqlError": ServerOperationError, "Unknown": DatabaseError, } diff --git a/src/databricks/sql/backend/kernel/client.py b/src/databricks/sql/backend/kernel/client.py index b9ff6a738..f10293311 100644 --- a/src/databricks/sql/backend/kernel/client.py +++ b/src/databricks/sql/backend/kernel/client.py @@ -517,14 +517,13 @@ def get_columns( ) -> "ResultSet": if self._kernel_session is None: raise InterfaceError("get_columns requires an open session.") - if not catalog_name: - # Kernel's list_columns requires a catalog (SEA `SHOW - # COLUMNS` cannot span catalogs). Surface the constraint - # explicitly rather than letting the kernel error. - raise ProgrammingError( - "get_columns requires catalog_name on the kernel backend." - ) try: + # `catalog_name=None` is supported: the kernel issues + # `SHOW COLUMNS IN ALL CATALOGS` server-side and the + # response carries `catalogName` per row, so each result + # row's `TABLE_CAT` is correctly attributed. Matches the + # Thrift backend's `getColumns(null, …)` behaviour from + # the user's perspective. stream = self._kernel_session.metadata().list_columns( catalog=catalog_name, schema_pattern=schema_name, diff --git a/src/databricks/sql/backend/kernel/type_mapping.py b/src/databricks/sql/backend/kernel/type_mapping.py index 4bd38e621..19e0f15f4 100644 --- a/src/databricks/sql/backend/kernel/type_mapping.py +++ b/src/databricks/sql/backend/kernel/type_mapping.py @@ -85,9 +85,11 @@ def description_from_arrow_schema(schema: pyarrow.Schema) -> List[Tuple]: """Build a PEP 249 ``description`` list from a pyarrow Schema. Each tuple is ``(name, type_code, display_size, internal_size, - precision, scale, null_ok)``. ``null_ok`` is taken from - ``field.nullable``; the other four are not reported by the - kernel today. + precision, scale, null_ok)``. PEP 249 allows ``null_ok`` to be + either a bool or ``None``; the Thrift backend always reports + ``None``, so we match that here for drop-in parity. The actual + nullability bit is still available via ``schema.field(i).nullable`` + for callers that want it from the Arrow schema directly. """ return [ ( @@ -97,7 +99,7 @@ def description_from_arrow_schema(schema: pyarrow.Schema) -> List[Tuple]: None, None, None, - field.nullable, + None, ) for field in schema ] diff --git a/tests/unit/test_kernel_client.py b/tests/unit/test_kernel_client.py index a9e9c9090..0e2948284 100644 --- a/tests/unit/test_kernel_client.py +++ b/tests/unit/test_kernel_client.py @@ -70,9 +70,9 @@ def __init__( NotSupportedError, OperationalError, ProgrammingError, + ServerOperationError, ) - # --------------------------------------------------------------------------- # Error mapping # --------------------------------------------------------------------------- @@ -93,7 +93,14 @@ def __init__( ("Internal", DatabaseError), ("InvalidStatementHandle", ProgrammingError), ("NetworkError", OperationalError), - ("SqlError", DatabaseError), + # `SqlError` is the kernel's slug for server-side query + # failures (syntax error, missing object, etc.) — exactly the + # case Thrift's backend surfaces as ``ServerOperationError``. + # Match Thrift so user code that catches the specific class + # works equivalently. ``ServerOperationError`` is itself a + # ``DatabaseError`` subclass, so existing catches of the base + # class are unaffected. + ("SqlError", ServerOperationError), ("Unknown", DatabaseError), ], ) @@ -309,20 +316,37 @@ def test_execute_command_rejects_query_tags(): ) -def test_get_columns_requires_catalog(): +def test_get_columns_accepts_none_catalog(): + """The kernel's `list_columns` honours `catalog=None` by issuing + `SHOW COLUMNS IN ALL CATALOGS` server-side. The connector should + pass `None` through rather than rejecting it, matching the Thrift + backend's `getColumns(null, …)` behaviour.""" c = _make_client() c._kernel_session = MagicMock() cursor = MagicMock() cursor.arraysize = 100 cursor.buffer_size_bytes = 1024 - with pytest.raises(ProgrammingError, match="catalog_name"): - c.get_columns( - session_id=MagicMock(), - max_rows=1, - max_bytes=1, - cursor=cursor, - catalog_name=None, - ) + cursor.connection = MagicMock() + # `list_columns` returns a stream the result-set wrapper will try + # to call `arrow_schema()` on; give it a minimal fake. + fake_stream = MagicMock() + fake_stream.arrow_schema.return_value = MagicMock(__iter__=lambda self: iter([])) + c._kernel_session.metadata.return_value.list_columns.return_value = fake_stream + + c.get_columns( + session_id=MagicMock(), + max_rows=1, + max_bytes=1, + cursor=cursor, + catalog_name=None, + ) + # `list_columns` should be called with catalog=None, not rejected. + c._kernel_session.metadata.return_value.list_columns.assert_called_once_with( + catalog=None, + schema_pattern=None, + table_pattern=None, + column_pattern=None, + ) # --------------------------------------------------------------------------- @@ -521,7 +545,9 @@ def test_pyo3_native_exception_wrapped_as_operational_error(): stmt = MagicMock() stmt.execute.side_effect = TypeError("argument 'foo' must be str, not int") c._kernel_session.statement.return_value = stmt - with pytest.raises(OperationalError, match="Unexpected error from databricks_sql_kernel"): + with pytest.raises( + OperationalError, match="Unexpected error from databricks_sql_kernel" + ): c.execute_command( operation="SELECT 1", session_id=MagicMock(), @@ -546,9 +572,7 @@ def test_pyo3_native_exception_wrapped_for_metadata_calls(): cursor.arraysize = 100 cursor.buffer_size_bytes = 1024 with pytest.raises(OperationalError): - c.get_catalogs( - session_id=MagicMock(), max_rows=1, max_bytes=1, cursor=cursor - ) + c.get_catalogs(session_id=MagicMock(), max_rows=1, max_bytes=1, cursor=cursor) # --------------------------------------------------------------------------- @@ -574,9 +598,7 @@ def test_kernel_error_during_result_set_construction_is_mapped(): cursor.arraysize = 100 cursor.buffer_size_bytes = 1024 with pytest.raises(DatabaseError, match="schema unavailable"): - c.get_catalogs( - session_id=MagicMock(), max_rows=1, max_bytes=1, cursor=cursor - ) + c.get_catalogs(session_id=MagicMock(), max_rows=1, max_bytes=1, cursor=cursor) # --------------------------------------------------------------------------- diff --git a/tests/unit/test_kernel_result_set.py b/tests/unit/test_kernel_result_set.py index 87fa4f0d2..9ec69380a 100644 --- a/tests/unit/test_kernel_result_set.py +++ b/tests/unit/test_kernel_result_set.py @@ -77,8 +77,9 @@ def int_schema(): def test_description_built_from_kernel_schema(int_schema): handle = _FakeKernelHandle(int_schema, []) rs = _make_rs(handle) - # null_ok slot reflects pyarrow.Field.nullable (True by default). - assert rs.description == [("n", "bigint", None, None, None, None, True)] + # `null_ok` slot is always `None` to match the Thrift backend's + # PEP-249 description shape. + assert rs.description == [("n", "bigint", None, None, None, None, None)] def test_fetchall_arrow_drains_all_batches(int_schema): @@ -107,7 +108,11 @@ def test_fetchmany_arrow_slices_within_batch(int_schema): def test_fetchmany_arrow_spans_batch_boundary(int_schema): handle = _FakeKernelHandle( int_schema, - [_batch(int_schema, [1, 2]), _batch(int_schema, [3, 4]), _batch(int_schema, [5, 6])], + [ + _batch(int_schema, [1, 2]), + _batch(int_schema, [3, 4]), + _batch(int_schema, [5, 6]), + ], ) rs = _make_rs(handle) t = rs.fetchmany_arrow(5) diff --git a/tests/unit/test_kernel_type_mapping.py b/tests/unit/test_kernel_type_mapping.py index bee1ee7be..a39cf366a 100644 --- a/tests/unit/test_kernel_type_mapping.py +++ b/tests/unit/test_kernel_type_mapping.py @@ -65,16 +65,21 @@ def test_description_from_schema_preserves_field_names_and_order(): ("name", "string"), ("created_at", "timestamp"), ] - # PEP 249 says 7-tuples. We don't report display_size / - # internal_size / precision / scale (all None); ``null_ok`` is - # taken from ``pyarrow.Field.nullable`` — True by default for - # schemas built from (name, type) pairs. + # PEP 249 7-tuples; we report column name and type only. PEP 249 + # allows ``null_ok`` to be ``None`` and that's what the Thrift + # backend has always returned; match it so kernel-backed cursors + # are drop-in compatible. The Arrow ``field.nullable`` bit is still + # available via ``schema.field(i).nullable`` for callers that need + # the real value. for d in desc: assert len(d) == 7 - assert d[2:] == (None, None, None, None, True) + assert d[2:] == (None, None, None, None, None) -def test_description_from_schema_reports_non_nullable_fields(): +def test_description_null_ok_always_none_regardless_of_field_nullable(): + # Match Thrift backend's behaviour: ``null_ok`` is ``None`` for + # every column even when the Arrow ``field.nullable`` bit is + # meaningful. schema = pa.schema( [ pa.field("id", pa.int64(), nullable=False), @@ -82,8 +87,8 @@ def test_description_from_schema_reports_non_nullable_fields(): ] ) desc = description_from_arrow_schema(schema) - assert desc[0][6] is False - assert desc[1][6] is True + assert desc[0][6] is None + assert desc[1][6] is None # ─── bind_tspark_params ────────────────────────────────────────────────── @@ -94,7 +99,9 @@ def _mk_param(*, type, value, ordinal=True, name=None): from databricks.sql.thrift_api.TCLIService import ttypes p = ttypes.TSparkParameter(ordinal=ordinal, name=name, type=type) - p.value = ttypes.TSparkParameterValue(stringValue=value) if value is not None else None + p.value = ( + ttypes.TSparkParameterValue(stringValue=value) if value is not None else None + ) return p From 85c522ffd9c0e3eab93ffce22584ed6d0e192a89 Mon Sep 17 00:00:00 2001 From: Vikrant Puppala Date: Tue, 19 May 2026 10:11:30 +0000 Subject: [PATCH 2/2] fix(backend/kernel): expose VARIANT columns as 'variant' in description MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The kernel writes the server-reported type name into Arrow field metadata under `databricks.type_name` (see `databricks_sql_kernel::reader::metadata_keys::TYPE_NAME`). `description_from_arrow_schema` now consults it and emits 'variant' when the metadata says so, matching the Thrift backend. Today only VARIANT needs this remap — every other precise type either lands on a dedicated Arrow shape (INT, DECIMAL, …) or collapses on both backends (INTERVAL_*, GEOMETRY, GEOGRAPHY). Closes 3 PREPARED_STATEMENT_TYPES / COMPLEX_TYPES diffs in the comparator. Signed-off-by: Vikrant Puppala --- .../sql/backend/kernel/type_mapping.py | 32 ++++++++++++++++++- tests/unit/test_kernel_type_mapping.py | 23 +++++++++++++ 2 files changed, 54 insertions(+), 1 deletion(-) diff --git a/src/databricks/sql/backend/kernel/type_mapping.py b/src/databricks/sql/backend/kernel/type_mapping.py index 19e0f15f4..cc6ccc7b9 100644 --- a/src/databricks/sql/backend/kernel/type_mapping.py +++ b/src/databricks/sql/backend/kernel/type_mapping.py @@ -90,11 +90,23 @@ def description_from_arrow_schema(schema: pyarrow.Schema) -> List[Tuple]: ``None``, so we match that here for drop-in parity. The actual nullability bit is still available via ``schema.field(i).nullable`` for callers that want it from the Arrow schema directly. + + ``type_code`` normally comes from the Arrow ``DataType`` via + ``_arrow_type_to_dbapi_string``, which collapses + Databricks-specific types into their nearest Arrow shape (e.g. + ``VARIANT`` → ``Utf8``). To recover the precise Databricks type + name, we consult the field's metadata first — the kernel writes + the server-reported type into ``databricks.type_name`` (see + ``databricks_sql_kernel::reader::metadata_keys``). Today only + ``VARIANT`` is special-cased here for parity with the Thrift + backend's behaviour; other precise types (``INTERVAL_*``, + ``GEOMETRY``, ``GEOGRAPHY``) collapse to their Arrow shape on + both backends and don't need a remap. """ return [ ( field.name, - _arrow_type_to_dbapi_string(field.type), + _databricks_type_for_field(field), None, None, None, @@ -105,6 +117,24 @@ def description_from_arrow_schema(schema: pyarrow.Schema) -> List[Tuple]: ] +def _databricks_type_for_field(field: pyarrow.Field) -> str: + """Pick the PEP 249 type code for a single field. + + Consults the field's Arrow metadata under + ``databricks.type_name`` (written by the kernel from the SEA + response's column type) so types that collapse onto a generic + Arrow shape can still be distinguished. Today only ``VARIANT`` + is mapped; everything else delegates to + ``_arrow_type_to_dbapi_string``. + """ + md = field.metadata or {} + # `databricks.type_name` is bytes (Arrow metadata is always + # bytes); compare against bytes to avoid one encode per field. + if md.get(b"databricks.type_name") == b"VARIANT": + return "variant" + return _arrow_type_to_dbapi_string(field.type) + + def _tspark_param_value_str(param: ttypes.TSparkParameter) -> Any: """Extract the string-encoded value from a ``TSparkParameter``, or ``None`` for SQL NULL. diff --git a/tests/unit/test_kernel_type_mapping.py b/tests/unit/test_kernel_type_mapping.py index a39cf366a..7d1cce546 100644 --- a/tests/unit/test_kernel_type_mapping.py +++ b/tests/unit/test_kernel_type_mapping.py @@ -91,6 +91,29 @@ def test_description_null_ok_always_none_regardless_of_field_nullable(): assert desc[1][6] is None +def test_description_uses_databricks_type_name_for_variant(): + """VARIANT columns arrive over SEA as Arrow ``Utf8``; the kernel + annotates them with ``databricks.type_name=VARIANT`` so the + connector can recover the precise type for PEP-249 description. + Matches the Thrift backend, which exposes the same column as + ``variant``.""" + schema = pa.schema( + [ + pa.field( + "v", + pa.string(), + metadata={b"databricks.type_name": b"VARIANT"}, + ), + # Plain Utf8 column without the metadata stays ``string`` + # so we don't claim "variant" for everything. + pa.field("s", pa.string()), + ] + ) + desc = description_from_arrow_schema(schema) + assert desc[0][1] == "variant" + assert desc[1][1] == "string" + + # ─── bind_tspark_params ──────────────────────────────────────────────────