Skip to content

Commit fbd0f97

Browse files
author
damon
committed
v1.0.0: code review fixes, BatdocError enum, GitHub Actions CI/release
- BatdocError enum (Io, Zip, Document, Render) via thiserror, replacing Box<dyn Error>
- All clippy pedantic + nursery fixes (lossless casts, const fn, String::new, etc.)
- Shared xml_util module, GridBuilder struct, Record borrowing
- GitHub Actions CI: fmt, clippy pedantic+nursery, test, build on push
- GitHub Actions release: Linux x86_64 musl static + macOS aarch64, zstd compressed
- README updated with curl one-liner install commands
- Bumped to v1.0.0
1 parent ef27dd4 commit fbd0f97

12 files changed

Lines changed: 449 additions & 336 deletions

File tree

.github/workflows/ci.yml

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
name: CI
2+
3+
on:
4+
push:
5+
branches: [master]
6+
pull_request:
7+
branches: [master]
8+
9+
env:
10+
CARGO_TERM_COLOR: always
11+
12+
jobs:
13+
check:
14+
runs-on: ubuntu-latest
15+
steps:
16+
- uses: actions/checkout@v4
17+
18+
- uses: dtolnay/rust-toolchain@stable
19+
with:
20+
components: rustfmt, clippy
21+
22+
- uses: Swatinem/rust-cache@v2
23+
24+
- name: fmt
25+
run: cargo fmt -- --check
26+
27+
- name: clippy
28+
run: cargo clippy --all-targets -- -D warnings -W clippy::pedantic -W clippy::nursery
29+
30+
- name: test
31+
run: cargo test
32+
33+
- name: build
34+
run: cargo build --release

.github/workflows/release.yml

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
name: Release
2+
3+
on:
4+
push:
5+
tags: ["v*"]
6+
7+
permissions:
8+
contents: write
9+
10+
env:
11+
CARGO_TERM_COLOR: always
12+
13+
jobs:
14+
build:
15+
strategy:
16+
matrix:
17+
include:
18+
- target: x86_64-unknown-linux-musl
19+
os: ubuntu-latest
20+
artifact: batdoc-linux-x86_64.zst
21+
- target: aarch64-apple-darwin
22+
os: macos-latest
23+
artifact: batdoc-darwin-aarch64.zst
24+
25+
runs-on: ${{ matrix.os }}
26+
27+
steps:
28+
- uses: actions/checkout@v4
29+
30+
- uses: dtolnay/rust-toolchain@stable
31+
with:
32+
targets: ${{ matrix.target }}
33+
34+
- uses: Swatinem/rust-cache@v2
35+
with:
36+
key: ${{ matrix.target }}
37+
38+
- name: Install musl tools
39+
if: matrix.target == 'x86_64-unknown-linux-musl'
40+
run: sudo apt-get update && sudo apt-get install -y musl-tools
41+
42+
- name: Build
43+
run: cargo build --release --target ${{ matrix.target }}
44+
45+
- name: Compress binary
46+
run: zstd -19 target/${{ matrix.target }}/release/batdoc -o ${{ matrix.artifact }}
47+
48+
- name: Upload artifact
49+
uses: actions/upload-artifact@v4
50+
with:
51+
name: ${{ matrix.artifact }}
52+
path: ${{ matrix.artifact }}
53+
54+
release:
55+
needs: build
56+
runs-on: ubuntu-latest
57+
58+
steps:
59+
- uses: actions/download-artifact@v4
60+
with:
61+
merge-multiple: true
62+
63+
- name: Create release
64+
uses: softprops/action-gh-release@v2
65+
with:
66+
generate_release_notes: true
67+
files: |
68+
batdoc-linux-x86_64.zst
69+
batdoc-darwin-aarch64.zst

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "batdoc"
3-
version = "0.2.0"
3+
version = "1.0.0"
44
edition = "2021"
55
description = "Bat for .doc, .docx, .xls, and .xlsx files. Converts Microsoft Office documents to readable markdown."
66

@@ -13,6 +13,7 @@ bat = { version = "0.26.1", default-features = false, features = ["regex-fancy",
1313
cfb = "0.13"
1414
is-terminal = "0.4"
1515
quick-xml = "0.37"
16+
thiserror = "2"
1617
zip = { version = "2", default-features = false, features = ["deflate"] }
1718

1819
[profile.release]

README.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,17 @@ by checking for `word/document.xml` vs `xl/workbook.xml`.
1919

2020
## Install
2121

22+
**Linux (x86_64, static musl):**
23+
```
24+
curl -sL https://github.com/daemonp/batdoc/releases/latest/download/batdoc-linux-x86_64.zst | zstd -d > batdoc && chmod +x batdoc
25+
```
26+
27+
**macOS (Apple Silicon):**
28+
```
29+
curl -sL https://github.com/daemonp/batdoc/releases/latest/download/batdoc-darwin-aarch64.zst | zstd -d > batdoc && chmod +x batdoc
30+
```
31+
32+
**From source:**
2233
```
2334
cargo build --release
2435
cp target/release/batdoc ~/.local/bin/

src/doc.rs

Lines changed: 19 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
use cfb::CompoundFile;
1010
use std::io::{Cursor, Read};
1111

12+
use crate::error::BatdocError;
1213
use crate::heuristic;
1314

1415
// FIB flag bits
@@ -22,41 +23,47 @@ const F_EXT_CHAR: u16 = 0x1000;
2223
/// - Numbered lines like "1. Foo" or "1.2 Bar" that are short → headings
2324
/// - Short standalone lines (< 80 chars, no sentence-ending punctuation) → bold
2425
/// - Tab-separated lines with consistent columns → markdown tables
25-
pub fn extract_markdown(data: &[u8]) -> Result<String, Box<dyn std::error::Error>> {
26+
pub fn extract_markdown(data: &[u8]) -> crate::error::Result<String> {
2627
let plain = extract_plain(data)?;
2728
Ok(heuristic::plain_to_markdown(&plain))
2829
}
2930

3031
/// Extract plain text from an OLE2 .doc file.
3132
/// Returns the document text as a String with paragraph separation.
32-
pub fn extract_plain(data: &[u8]) -> Result<String, Box<dyn std::error::Error>> {
33+
pub fn extract_plain(data: &[u8]) -> crate::error::Result<String> {
3334
let cursor = Cursor::new(data);
3435
let mut cfb = CompoundFile::open(cursor)?;
3536

3637
let stream_path = "/WordDocument";
3738
if !cfb.exists(stream_path) {
38-
return Err("not a Word document (no WordDocument stream)".into());
39+
return Err(BatdocError::Document(
40+
"not a Word document (no WordDocument stream)".into(),
41+
));
3942
}
4043

4144
let mut stream = cfb.open_stream(stream_path)?;
4245
let mut buf = Vec::new();
4346
stream.read_to_end(&mut buf)?;
4447

4548
if buf.len() < 32 {
46-
return Err("WordDocument stream too short".into());
49+
return Err(BatdocError::Document(
50+
"WordDocument stream too short".into(),
51+
));
4752
}
4853

4954
let flags = u16::from_le_bytes([buf[10], buf[11]]);
5055

5156
if flags & F_ENCRYPTED != 0 {
52-
return Err("document is encrypted".into());
57+
return Err(BatdocError::Document("document is encrypted".into()));
5358
}
5459

55-
let text_start = u32::from_le_bytes([buf[24], buf[25], buf[26], buf[27]]) as usize;
60+
let text_start = u32::from_le_bytes([buf[24], buf[25], buf[26], buf[27]]) as usize; // u32 → usize: lossless on 32+ bit
5661
let text_end = u32::from_le_bytes([buf[28], buf[29], buf[30], buf[31]]) as usize;
5762

5863
if text_start >= buf.len() || text_end > buf.len() || text_start >= text_end {
59-
return Err("invalid text boundaries in FIB".into());
64+
return Err(BatdocError::Document(
65+
"invalid text boundaries in FIB".into(),
66+
));
6067
}
6168

6269
let text_data = &buf[text_start..text_end];
@@ -120,7 +127,7 @@ pub(crate) fn cp1252_to_unicode(b: u8) -> u16 {
120127
if (0x80..=0x9F).contains(&b) {
121128
CP1252_MAP[(b - 0x80) as usize]
122129
} else {
123-
b as u16
130+
u16::from(b)
124131
}
125132
}
126133

@@ -155,7 +162,7 @@ fn chars_to_text(chars: &[u16]) -> String {
155162
if let Some(hi) = pending_high_surrogate.take() {
156163
if (0xDC00..=0xDFFF).contains(&c) {
157164
// Valid surrogate pair → supplementary plane character
158-
let code = 0x10000 + ((hi as u32 - 0xD800) << 10) + (c as u32 - 0xDC00);
165+
let code = 0x10000 + ((u32::from(hi) - 0xD800) << 10) + (u32::from(c) - 0xDC00);
159166
if let Some(ch) = char::from_u32(code) {
160167
paragraph.push(ch);
161168
}
@@ -201,7 +208,7 @@ fn chars_to_text(chars: &[u16]) -> String {
201208
}
202209
c if c < 0x0020 => {}
203210
c => {
204-
if let Some(ch) = char::from_u32(c as u32) {
211+
if let Some(ch) = char::from_u32(u32::from(c)) {
205212
paragraph.push(ch);
206213
}
207214
}
@@ -226,14 +233,14 @@ mod tests {
226233
#[test]
227234
fn cp1252_ascii_passthrough() {
228235
for b in 0x00..=0x7Fu8 {
229-
assert_eq!(cp1252_to_unicode(b), b as u16);
236+
assert_eq!(cp1252_to_unicode(b), u16::from(b));
230237
}
231238
}
232239

233240
#[test]
234241
fn cp1252_high_passthrough() {
235242
for b in 0xA0..=0xFFu8 {
236-
assert_eq!(cp1252_to_unicode(b), b as u16);
243+
assert_eq!(cp1252_to_unicode(b), u16::from(b));
237244
}
238245
}
239246

0 commit comments

Comments
 (0)