Commit 6722583: init (0 parents)

30 files changed: 7769 additions & 0 deletions

.github/workflows/ci.yml

Lines changed: 27 additions & 0 deletions

```yaml
name: CI

on:
  push:
    branches: [main]
  pull_request:
    branches: [main]

jobs:
  build-and-test:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        node-version: [20, 22, 24]
    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-node@v4
        with:
          node-version: ${{ matrix.node-version }}
          cache: 'npm'

      - run: npm ci

      - run: npm run build

      - run: npm test
```

.github/workflows/publish.yml

Lines changed: 28 additions & 0 deletions

```yaml
name: Publish to npm

on:
  release:
    types:
      - published

jobs:
  publish:
    runs-on: ubuntu-latest
    permissions:
      contents: read
      id-token: write
    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-node@v4
        with:
          node-version: '24'
          registry-url: 'https://registry.npmjs.org'

      - run: npm ci

      - run: npm test

      - run: npm run build

      - run: npm publish --provenance --access public
```

.gitignore

Lines changed: 7 additions & 0 deletions

```
node_modules
dist/
*.log
.DS_Store
coverage/
.env
.env.*
```

README.md

Lines changed: 247 additions & 0 deletions
# @bitofsky/databricks-sql

Databricks SQL client for Node.js that talks directly to the REST API and streams large results efficiently. No SDK lock-in, no warehouse-side streaming bottlenecks.

## Why This Exists

I built this while working on an MCP server that queries Databricks SQL for large datasets. The Databricks Node.js SDK does not support External Links, so you either roll your own REST client or give up on large results. Rolling your own immediately creates a new problem: dozens of presigned URLs per query, each with chunked data and format-specific quirks (CSV headers, JSON array brackets, Arrow EOS markers).

This project pairs with `@bitofsky/merge-streams` to solve that pain:

- Databricks returns N chunk URLs
- We merge them into one clean stream
- You upload once and return one URL to clients

The goal is simple: stream big results with stable memory usage, without forcing clients to juggle chunks.

## Highlights

- Direct REST calls to the Statement Execution API.
- Polls statement execution until completion.
- Efficient external link handling: merges chunks into a single stream.
- `mergeExternalLinks` supports streaming uploads and returns a new `StatementResult` with a presigned URL.
- `fetchRow`/`fetchAll` support `JSON_OBJECT` (schema-based row mapping).
- External links + `JSON_ARRAY` are supported for row iteration (streaming JSON parsing).

## Install

```bash
npm install @bitofsky/databricks-sql
```

## Sample (fetchAll)

```ts
import { executeStatement, fetchAll } from '@bitofsky/databricks-sql'

const auth = {
  token: process.env.DATABRICKS_TOKEN!,
  host: process.env.DATABRICKS_HOST!,
  httpPath: process.env.DATABRICKS_HTTP_PATH!,
}

const result = await executeStatement('SELECT 1 AS value', auth)
const rows = await fetchAll(result, auth, { format: 'JSON_OBJECT' })
console.log(rows) // [{ value: 1 }]
```
## Sample (Streaming + Presigned URL)

Stream external links into S3, then return a single presigned URL:

```ts
import { executeStatement, mergeExternalLinks } from '@bitofsky/databricks-sql'
import { GetObjectCommand, PutObjectCommand, S3Client } from '@aws-sdk/client-s3'
import { getSignedUrl } from '@aws-sdk/s3-request-presigner'

const auth = {
  token: process.env.DATABRICKS_TOKEN!,
  host: process.env.DATABRICKS_HOST!, // e.g. abc.cloud.databricks.com
  httpPath: process.env.DATABRICKS_HTTP_PATH!, // e.g. /sql/1.0/warehouses/...
}

const s3 = new S3Client({ region: process.env.AWS_REGION! })
const bucket = process.env.DATABRICKS_SQL_S3_BUCKET!

const result = await executeStatement(
  'SELECT * FROM samples.tpch.lineitem LIMIT 100000', // Large result
  auth,
  { disposition: 'EXTERNAL_LINKS', format: 'CSV' }
)

const merged = await mergeExternalLinks(result, auth, {
  mergeStreamToExternalLink: async (stream) => {
    const key = `merged-${Date.now()}.csv`
    await s3.send(
      new PutObjectCommand({
        Bucket: bucket,
        Key: key,
        Body: stream,
        ContentType: 'text/csv',
      })
    )

    const externalLink = await getSignedUrl(
      s3,
      new GetObjectCommand({ Bucket: bucket, Key: key }),
      { expiresIn: 3600 }
    )

    return {
      externalLink,
      byte_count: 0,
      expiration: new Date(Date.now() + 3600 * 1000).toISOString(),
    }
  },
})

console.log(merged.result?.external_links?.[0].external_link) // Presigned URL to merged CSV
```
## Sample (Abort)

Cancel a long-running query and stop polling/streaming:

```ts
import { executeStatement, fetchAll } from '@bitofsky/databricks-sql'

const auth = {
  token: process.env.DATABRICKS_TOKEN!,
  host: process.env.DATABRICKS_HOST!,
  httpPath: process.env.DATABRICKS_HTTP_PATH!,
}

const controller = new AbortController()
const timeout = setTimeout(() => controller.abort(), 5000)

try {
  const result = await executeStatement(
    'SELECT * FROM samples.tpch.lineitem',
    auth,
    { signal: controller.signal }
  )

  const rows = await fetchAll(result, auth, { signal: controller.signal })
  console.log(rows.length)
} finally {
  clearTimeout(timeout)
}
```
## API

### AuthInfo

```ts
type AuthInfo = {
  token: string
  host: string
  httpPath: string
}
```

### executeStatement(query, auth, options?)

```ts
function executeStatement(
  query: string,
  auth: AuthInfo,
  options?: ExecuteStatementOptions
): Promise<StatementResult>
```

- Calls the Databricks Statement Execution API and polls until completion.
- Use `options.onProgress` to receive status updates.
- Throws `DatabricksSqlError` on failure, `StatementCancelledError` on cancel, and `AbortError` on abort.
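A minimal progress-logging sketch, reusing the `auth` shape from the samples above (the whole status object is logged, since its fields are not spelled out here):

```ts
import { executeStatement } from '@bitofsky/databricks-sql'

const auth = {
  token: process.env.DATABRICKS_TOKEN!,
  host: process.env.DATABRICKS_HOST!,
  httpPath: process.env.DATABRICKS_HTTP_PATH!,
}

try {
  await executeStatement(
    'SELECT COUNT(*) AS cnt FROM samples.tpch.lineitem',
    auth,
    {
      // Called with the latest StatementStatus on each poll cycle.
      onProgress: (status) => console.log('status:', status),
    }
  )
  console.log('statement finished')
} catch (err) {
  // DatabricksSqlError on failure, StatementCancelledError on cancel, AbortError on abort.
  console.error('statement did not complete:', err)
}
```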
### fetchRow(statementResult, auth, options?)

```ts
function fetchRow(
  statementResult: StatementResult,
  auth: AuthInfo,
  options?: FetchRowsOptions
): Promise<void>
```

- Streams each row to `options.onEachRow`.
- Use `format: 'JSON_OBJECT'` to map rows into schema-based objects.
- Supports `INLINE` results or `JSON_ARRAY`-formatted `EXTERNAL_LINKS` only.
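A row-streaming sketch under the constraint above (an `EXTERNAL_LINKS` + `JSON_ARRAY` statement, mapped to objects with `JSON_OBJECT`); the query and the running count are illustrative:

```ts
import { executeStatement, fetchRow } from '@bitofsky/databricks-sql'

const auth = {
  token: process.env.DATABRICKS_TOKEN!,
  host: process.env.DATABRICKS_HOST!,
  httpPath: process.env.DATABRICKS_HTTP_PATH!,
}

const result = await executeStatement(
  'SELECT * FROM samples.tpch.lineitem LIMIT 10000',
  auth,
  { disposition: 'EXTERNAL_LINKS', format: 'JSON_ARRAY' }
)

let count = 0
await fetchRow(result, auth, {
  format: 'JSON_OBJECT', // map each row onto the schema as an object
  onEachRow: (row) => {
    // Rows arrive one at a time, so memory stays flat regardless of result size.
    count += 1
  },
})
console.log(`processed ${count} rows`)
```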
### fetchAll(statementResult, auth, options?)

```ts
function fetchAll(
  statementResult: StatementResult,
  auth: AuthInfo,
  options?: FetchAllOptions
): Promise<Array<RowArray | RowObject>>
```

- Collects all rows into an array. For large results, prefer `fetchRow`/`fetchStream`.
- Supports `INLINE` results or `JSON_ARRAY`-formatted `EXTERNAL_LINKS` only.
### fetchStream(statementResult, auth, options?)

```ts
function fetchStream(
  statementResult: StatementResult,
  auth: AuthInfo,
  options?: FetchStreamOptions
): Readable
```

- Merges `EXTERNAL_LINKS` into a single binary stream.
- Preserves the original format (`JSON_ARRAY`, `CSV`, `ARROW_STREAM`).
- Ends as an empty stream when no external links exist.
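A sketch that spools the merged stream to a local file with `node:stream/promises`; the file name is illustrative, and the bytes keep whatever `format` was requested at execution time:

```ts
import { createWriteStream } from 'node:fs'
import { pipeline } from 'node:stream/promises'
import { executeStatement, fetchStream } from '@bitofsky/databricks-sql'

const auth = {
  token: process.env.DATABRICKS_TOKEN!,
  host: process.env.DATABRICKS_HOST!,
  httpPath: process.env.DATABRICKS_HTTP_PATH!,
}

const result = await executeStatement(
  'SELECT * FROM samples.tpch.lineitem LIMIT 100000',
  auth,
  { disposition: 'EXTERNAL_LINKS', format: 'CSV' }
)

// One Readable over all chunk URLs, in order; pipeline handles backpressure.
await pipeline(fetchStream(result, auth), createWriteStream('lineitem.csv'))
```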
### mergeExternalLinks(statementResult, auth, options)

```ts
function mergeExternalLinks(
  statementResult: StatementResult,
  auth: AuthInfo,
  options: MergeExternalLinksOptions
): Promise<StatementResult>
```

- Creates a merged stream from `EXTERNAL_LINKS`, uploads it via `options.mergeStreamToExternalLink`, then returns a `StatementResult` with a single external link.
- Returns the original result unchanged when the input is `INLINE`.
### Options (Summary)

```ts
type ExecuteStatementOptions = {
  onProgress?: (status: StatementStatus) => void
  signal?: AbortSignal
  disposition?: 'INLINE' | 'EXTERNAL_LINKS'
  format?: 'JSON_ARRAY' | 'ARROW_STREAM' | 'CSV'
  wait_timeout?: string
  row_limit?: number
  byte_limit?: number
  catalog?: string
  schema?: string
  parameters?: StatementParameter[]
  on_wait_timeout?: 'CONTINUE' | 'CANCEL'
  warehouse_id?: string
}

type FetchRowsOptions = {
  signal?: AbortSignal
  onEachRow?: (row: RowArray | RowObject) => void
  format?: 'JSON_ARRAY' | 'JSON_OBJECT'
}

type FetchAllOptions = {
  signal?: AbortSignal
  format?: 'JSON_ARRAY' | 'JSON_OBJECT'
}

type FetchStreamOptions = {
  signal?: AbortSignal
}

type MergeExternalLinksOptions = {
  signal?: AbortSignal
  mergeStreamToExternalLink: (stream: Readable) => Promise<{
    externalLink: string
    byte_count: number
    expiration: string
  }>
}
```
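As one sketch of the less obvious options, a parameterized query with a server-side row cap; the `{ name, value }` parameter shape is an assumption mirroring the Databricks Statement Execution REST API, since `StatementParameter` is not spelled out above:

```ts
import { executeStatement, fetchAll } from '@bitofsky/databricks-sql'

const auth = {
  token: process.env.DATABRICKS_TOKEN!,
  host: process.env.DATABRICKS_HOST!,
  httpPath: process.env.DATABRICKS_HTTP_PATH!,
}

const result = await executeStatement(
  'SELECT * FROM lineitem WHERE l_shipmode = :mode',
  auth,
  {
    catalog: 'samples',
    schema: 'tpch',
    row_limit: 1000, // cap rows server-side
    // Assumed StatementParameter shape, per the Databricks REST API.
    parameters: [{ name: 'mode', value: 'AIR' }],
  }
)

const rows = await fetchAll(result, auth, { format: 'JSON_OBJECT' })
console.log(rows.length)
```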
## Notes

- Databricks requires `INLINE` results to use the `JSON_ARRAY` format; `INLINE` + `CSV` is rejected by the API.
- `EXTERNAL_LINKS` are merged using `@bitofsky/merge-streams`.
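Concretely, an inline fetch has to look like this (reusing `auth` from the samples above):

```ts
// INLINE pairs only with JSON_ARRAY; request CSV or ARROW_STREAM via EXTERNAL_LINKS instead.
const inline = await executeStatement('SELECT 1 AS value', auth, {
  disposition: 'INLINE',
  format: 'JSON_ARRAY',
})
```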
## Development

```bash
npm run build:tsc
npm test
```

S3 integration tests require Databricks and AWS credentials in `.env`.
