
Commit b8bade1

fix: dropping child table rows during streaming replication (#880)
For partitioned tables, we were only streaming rows for the first child instead of all of them.
1 parent d334760 commit b8bade1

4 files changed

Lines changed: 139 additions & 59 deletions
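
The sketch below is a minimal, self-contained Rust illustration of the bug and the fix, using made-up stand-in types (a u32 Oid, a string destination key, a string literal in place of the real prepared statements). It is not pgdog's actual subscriber code, only a model of why deduplicating on the destination table key dropped rows for every partition child after the first, and how mapping the destination key to the owning oid fixes it.

use std::collections::{HashMap, HashSet};

// Simplified, hypothetical stand-ins for the real pgdog types.
type Oid = u32;
type Key = String; // destination "schema.table"

fn main() {
    // Two partition leaves with distinct oids in the WAL stream, both mapping
    // to the same destination table.
    let leaves: Vec<(Oid, Key)> = vec![
        (16384, "public.sharded".to_string()),
        (16385, "public.sharded".to_string()),
    ];

    // Old behavior: dedup on the destination key and bail out early, so only
    // the first leaf's oid ever gets statements registered.
    let mut dedup: HashSet<Key> = HashSet::new();
    let mut statements: HashMap<Oid, &str> = HashMap::new();
    for (oid, key) in &leaves {
        if dedup.contains(key) {
            continue; // second leaf skipped entirely
        }
        statements.insert(*oid, "prepared INSERT/UPDATE/DELETE");
        dedup.insert(key.clone());
    }
    // DML arrives tagged with the leaf's oid; the second leaf has no entry,
    // so its rows are silently dropped.
    assert!(statements.get(&16385).is_none());

    // New behavior: remember which oid owns the prepared statements for each
    // destination key and alias every later leaf's oid to those statements.
    let mut keys: HashMap<Key, Oid> = HashMap::new();
    let mut statements: HashMap<Oid, &str> = HashMap::new();
    for (oid, key) in &leaves {
        if let Some(owner) = keys.get(key) {
            let shared = statements[owner];
            statements.insert(*oid, shared);
        } else {
            statements.insert(*oid, "prepared INSERT/UPDATE/DELETE");
            keys.insert(key.clone(), *oid);
        }
    }
    assert!(statements.get(&16385).is_some());
    println!("both partition leaves resolve to prepared statements");
}

The real change in stream.rs follows the same shape: the first child prepares the statements and records its oid under the destination key; later children clone those statements under their own oid instead of returning early.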

File tree

integration/resharding/dev.sh
pgdog/src/backend/replication/logical/error.rs
pgdog/src/backend/replication/logical/subscriber/stream.rs
pgdog/src/backend/replication/logical/subscriber/tests.rs

integration/resharding/dev.sh

Lines changed: 12 additions & 0 deletions
@@ -4,6 +4,18 @@ SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
 DEFAULT_BIN="${SCRIPT_DIR}/../../target/debug/pgdog"
 PGDOG_BIN=${PGDOG_BIN:-$DEFAULT_BIN}
 
+# Run in our own process group so we can kill every child on exit.
+set -m
+cleanup() {
+    trap - EXIT INT TERM
+    # Signal every process in this script's process group except ourselves.
+    pkill -TERM -P $$ 2> /dev/null || true
+    # Give children a moment to exit cleanly, then force-kill anything left.
+    sleep 1
+    pkill -KILL -P $$ 2> /dev/null || true
+}
+trap cleanup EXIT INT TERM
+
 pushd ${SCRIPT_DIR}
 docker-compose down && docker-compose up -d

pgdog/src/backend/replication/logical/error.rs

Lines changed: 3 additions & 0 deletions
@@ -131,6 +131,9 @@ pub enum Error {
 
     #[error("command complete has no rows: {0}")]
     CommandCompleteNoRows(CommandComplete),
+
+    #[error("missing key in replication stream, out of sync")]
+    MissingKey,
 }
 
 impl From<ErrorResponse> for Error {

pgdog/src/backend/replication/logical/subscriber/stream.rs

Lines changed: 63 additions & 59 deletions
@@ -131,9 +131,8 @@ pub struct StreamSubscriber {
 
     // Statements
    statements: HashMap<Oid, Statements>,
-
-    // Partitioned tables dedup.
-    partitioned_dedup: HashSet<Key>,
+    // Mapping of table keys to their oid.
+    keys: HashMap<Key, Oid>,
 
     // LSNs for each table
     table_lsns: HashMap<Oid, i64>,

@@ -171,7 +170,6 @@ impl StreamSubscriber {
             cluster,
             relations: HashMap::new(),
             statements: HashMap::new(),
-            partitioned_dedup: HashSet::new(),
             table_lsns: HashMap::new(),
             changed_tables: HashSet::new(),
             tables: tables

@@ -193,6 +191,7 @@ impl StreamSubscriber {
             in_transaction: false,
             query_parser_engine,
             missed_rows: MissedRows::default(),
+            keys: HashMap::default(),
         }
     }
 

@@ -466,73 +465,78 @@ impl StreamSubscriber {
                 name: table.table.destination_name().to_string(),
             };
 
-            if self.partitioned_dedup.contains(&dest_key) {
-                debug!("queries for table {} already prepared", dest_key);
-                return Ok(());
-            }
+            // Partition child tables target the parent on the destination shard,
+            // we don't need to prepare the same statement per child.
+            if let Some(oid) = self.keys.get(&dest_key) {
+                let statements = self.statements.get(oid).ok_or(Error::MissingKey)?;
+                self.statements.insert(relation.oid, statements.clone());
 
-            let insert = Statement::new(&table.insert(false), self.query_parser_engine)?;
-            let upsert = Statement::new(&table.insert(true), self.query_parser_engine)?;
-            let update = Statement::new(&table.update(), self.query_parser_engine)?;
-            let delete = Statement::new(&table.delete(), self.query_parser_engine)?;
+                debug!("queries for table {} already prepared", dest_key);
+            } else {
+                let insert = Statement::new(&table.insert(false), self.query_parser_engine)?;
+                let upsert = Statement::new(&table.insert(true), self.query_parser_engine)?;
+                let update = Statement::new(&table.update(), self.query_parser_engine)?;
+                let delete = Statement::new(&table.delete(), self.query_parser_engine)?;
+
+                for server in &mut self.connections {
+                    for stmt in &[&insert, &upsert, &update, &delete] {
+                        debug!("preparing \"{}\" [{}]", stmt.query(), server.addr());
+                    }
 
-            for server in &mut self.connections {
-                for stmt in &[&insert, &upsert, &update, &delete] {
-                    debug!("preparing \"{}\" [{}]", stmt.query(), server.addr());
+                    server
+                        .send(
+                            &vec![
+                                insert.parse().clone().into(),
+                                upsert.parse().clone().into(),
+                                update.parse().clone().into(),
+                                delete.parse().clone().into(),
+                                if self.in_transaction {
+                                    Flush.into()
+                                } else {
+                                    Sync.into()
+                                },
+                            ]
+                            .into(),
+                        )
+                        .await?;
                 }
 
-                server
-                    .send(
-                        &vec![
-                            insert.parse().clone().into(),
-                            upsert.parse().clone().into(),
-                            update.parse().clone().into(),
-                            delete.parse().clone().into(),
-                            if self.in_transaction {
-                                Flush.into()
-                            } else {
-                                Sync.into()
-                            },
-                        ]
-                        .into(),
-                    )
-                    .await?;
-            }
-
-            for server in &mut self.connections {
-                let num_messages = if self.in_transaction { 4 } else { 5 };
-                for _ in 0..num_messages {
-                    let msg = server.read().await?;
-                    trace!("[{}] --> {:?}", server.addr(), msg);
-
-                    match msg.code() {
-                        'E' => {
-                            return Err(Error::PgError(Box::new(ErrorResponse::from_bytes(
-                                msg.to_bytes()?,
-                            )?)))
+                for server in &mut self.connections {
+                    let num_messages = if self.in_transaction { 4 } else { 5 };
+                    for _ in 0..num_messages {
+                        let msg = server.read().await?;
+                        trace!("[{}] --> {:?}", server.addr(), msg);
+
+                        match msg.code() {
+                            'E' => {
+                                return Err(Error::PgError(Box::new(ErrorResponse::from_bytes(
+                                    msg.to_bytes()?,
+                                )?)))
+                            }
+                            'Z' => break,
+                            '1' => continue,
+                            c => return Err(Error::RelationOutOfSync(c)),
                         }
-                        'Z' => break,
-                        '1' => continue,
-                        c => return Err(Error::RelationOutOfSync(c)),
                     }
                 }
-            }
 
-            self.statements.insert(
-                relation.oid,
-                Statements {
-                    insert,
-                    upsert,
-                    update,
-                    delete,
-                    omni: !table.is_sharded(&self.cluster.sharding_schema().tables),
-                },
-            );
+                self.statements.insert(
+                    relation.oid,
+                    Statements {
+                        insert,
+                        upsert,
+                        update,
+                        delete,
+                        omni: !table.is_sharded(&self.cluster.sharding_schema().tables),
+                    },
+                );
+
+                self.keys.insert(dest_key, relation.oid);
+            }
 
             // Only record tables we expect to stream changes for.
             self.table_lsns.insert(relation.oid, table.lsn.lsn);
             self.relations.insert(relation.oid, relation);
-            self.partitioned_dedup.insert(dest_key);
         }
 
         Ok(())

pgdog/src/backend/replication/logical/subscriber/tests.rs

Lines changed: 61 additions & 0 deletions
@@ -466,6 +466,67 @@ async fn relation_after_insert_inside_transaction() {
     cleanup(&mut verify, "public.sharded_test_b", &[&id_b]).await;
 }
 
+/// Two source tables (e.g. partition leaves) that map to the same destination
+/// must each register their oid so DML for *both* oids is applied. Regression
+/// test for the partition-dedup row-drop bug: previously the second leaf's
+/// Relation message returned early without registering its oid in `statements`,
+/// causing all subsequent inserts on that oid to be silently dropped.
+#[tokio::test]
+async fn partition_leaves_share_destination() {
+    let mut leaf_a = make_sharded_table();
+    leaf_a.table.name = "sharded_p1".to_string();
+    leaf_a.table.parent_schema = "public".to_string();
+    leaf_a.table.parent_name = "sharded".to_string();
+
+    let mut leaf_b = make_sharded_table();
+    leaf_b.table.name = "sharded_p2".to_string();
+    leaf_b.table.parent_schema = "public".to_string();
+    leaf_b.table.parent_name = "sharded".to_string();
+
+    let cluster = Cluster::new_test_single_shard(&config());
+    let mut sub = StreamSubscriber::new(&cluster, &[leaf_a, leaf_b], QueryParserEngine::default());
+    let mut verify = test_server().await;
+    sub.connect().await.unwrap();
+
+    let oid_a = Oid(16384);
+    let oid_b = Oid(16385);
+    let id_a = random_id();
+    let id_b = random_id();
+
+    cleanup(&mut verify, "public.sharded", &[&id_a, &id_b]).await;
+
+    // Each leaf has its own oid in the WAL stream but resolves to the same
+    // destination table via parent_schema/parent_name.
+    let mut relation_a = sharded_relation(oid_a);
+    relation_a.name = "sharded_p1".to_string();
+    let mut relation_b = sharded_relation(oid_b);
+    relation_b.name = "sharded_p2".to_string();
+
+    sub.handle(begin_copy_data(100)).await.unwrap();
+    sub.handle(xlog_copy_data(relation_a.to_bytes().unwrap()))
+        .await
+        .unwrap();
+    sub.handle(xlog_copy_data(relation_b.to_bytes().unwrap()))
+        .await
+        .unwrap();
+
+    sub.handle(insert_copy_data(oid_a, &id_a, "leaf_a"))
+        .await
+        .unwrap();
+    sub.handle(insert_copy_data(oid_b, &id_b, "leaf_b"))
+        .await
+        .unwrap();
+
+    sub.handle(commit_copy_data(200)).await.unwrap();
+
+    // Both inserts must land in the shared destination table. Before the fix,
+    // leaf_b's row would be silently dropped.
+    assert_eq!(count_row(&mut verify, "public.sharded", &id_a).await, 1);
+    assert_eq!(count_row(&mut verify, "public.sharded", &id_b).await, 1);
+
+    cleanup(&mut verify, "public.sharded", &[&id_a, &id_b]).await;
+}
+
 // ── Data flow tests ─────────────────────────────────────────────────
 
 /// Full transaction: Begin → Relation → Insert → Commit, verified in Postgres.
