Don’t parse binlog events of unrelated tables #36
Conversation
|
I had a benchmark for a bit that compared filtering events to not filtering them, but it didn't feel worth checking in:
package binlog
import (
"reflect"
"testing"
"unsafe"
gomysql "github.com/go-mysql-org/go-mysql/mysql"
"github.com/go-mysql-org/go-mysql/replication"
)
// benchmarkSetUnexportedField overwrites the field called fieldName on the
// struct that target points to, including unexported fields. It goes through
// reflect.NewAt on the field's address so that the read-only restriction
// reflect places on unexported fields does not apply.
//
// target must be a non-nil pointer to a struct and fieldName must name one of
// its fields; otherwise the function panics with a message naming the
// problem instead of an opaque reflect panic. value must be assignable to the
// field's type (reflect.Value.Set panics if it is not).
func benchmarkSetUnexportedField(target interface{}, fieldName string, value interface{}) {
	ptr := reflect.ValueOf(target)
	if ptr.Kind() != reflect.Ptr || ptr.IsNil() || ptr.Elem().Kind() != reflect.Struct {
		panic("benchmarkSetUnexportedField: target must be a non-nil pointer to a struct")
	}

	field := ptr.Elem().FieldByName(fieldName)
	if !field.IsValid() {
		// FieldByName returns the zero Value for unknown names; fail here with
		// the offending name rather than inside field.Type().
		panic("benchmarkSetUnexportedField: no field named " + fieldName)
	}

	// Re-derive a settable Value from the field's address; field itself is
	// not settable when it was reached through an unexported name.
	reflect.NewAt(field.Type(), unsafe.Pointer(field.UnsafeAddr())).Elem().Set(reflect.ValueOf(value))
}
// newBenchmarkRowsEvent returns a version-1 replication.RowsEvent prepared for
// decoding WRITE_ROWS_EVENTv1 payloads. Its private table map holds a single
// entry (table id 7) for schemaName.tableName with one MYSQL_TYPE_LONG column.
// The private fields are filled in through benchmarkSetUnexportedField.
func newBenchmarkRowsEvent(schemaName, tableName string) *replication.RowsEvent {
	const tableID uint64 = 7

	// Table-map metadata the decoder looks up by table id.
	tableMap := &replication.TableMapEvent{
		TableID:     tableID,
		Schema:      []byte(schemaName),
		Table:       []byte(tableName),
		ColumnCount: 1,
		ColumnType:  []byte{gomysql.MYSQL_TYPE_LONG},
		ColumnMeta:  []uint16{0},
		NullBitmap:  []byte{0x00},
	}

	event := new(replication.RowsEvent)
	event.Version = 1

	// Unexported RowsEvent fields, applied in the same order as listed.
	privateFields := []struct {
		name  string
		value interface{}
	}{
		{"tableIDSize", 6},
		{"needBitmap2", false},
		{"eventType", replication.WRITE_ROWS_EVENTv1},
		{"tables", map[uint64]*replication.TableMapEvent{tableID: tableMap}},
	}
	for _, f := range privateFields {
		benchmarkSetUnexportedField(event, f.name, f.value)
	}

	return event
}
// newBenchmarkRowsEventHeaderData returns a freshly allocated 10-byte
// RowsEvent v1 header for table id 7 with a single column.
func newBenchmarkRowsEventHeaderData() []byte {
	header := make([]byte, 0, 10)
	header = append(header, 7, 0, 0, 0, 0, 0) // 6-byte little-endian table id
	header = append(header, 0, 0)             // 2-byte flags
	header = append(header, 1)                // length-encoded column count = 1
	header = append(header, 1)                // 1-byte column bitmap
	return header
}
// newBenchmarkRowsEventDataWithRows returns a RowsEvent v1 payload: the header
// from newBenchmarkRowsEventHeaderData followed by rowCount encoded rows. Each
// row is 5 bytes: a zero null-bitmap byte, then the row index as a 4-byte
// little-endian value for the single MYSQL_TYPE_LONG column.
func newBenchmarkRowsEventDataWithRows(rowCount int) []byte {
	payload := newBenchmarkRowsEventHeaderData()

	// row[0] is the null bitmap and stays zero; row[1:] is rewritten per row.
	var row [5]byte
	for n := 0; n < rowCount; n++ {
		v := uint32(n)
		row[1], row[2], row[3], row[4] = byte(v), byte(v>>8), byte(v>>16), byte(v>>24)
		payload = append(payload, row[:]...)
	}

	return payload
}
func BenchmarkRowsEventDecodeFiltering(b *testing.B) {
const rowCount = 1000
data := newBenchmarkRowsEventDataWithRows(rowCount)
b.Run("filtered_table_skips_row_decode", func(b *testing.B) {
rowsEvent := newBenchmarkRowsEvent("testdb", "ignored_table")
b.ReportAllocs()
b.SetBytes(int64(len(data)))
b.ResetTimer()
for i := 0; i < b.N; i++ {
rowsEvent.Rows = nil
rowsEvent.SkippedColumns = nil
_, err := rowsEvent.DecodeHeader(data)
if err != nil {
b.Fatal(err)
}
if string(rowsEvent.Table.Schema) != "testdb" || string(rowsEvent.Table.Table) != "ignored_table" {
b.Fatalf("unexpected table: %s.%s", rowsEvent.Table.Schema, rowsEvent.Table.Table)
}
}
})
b.Run("full_rows_event_decode", func(b *testing.B) {
rowsEvent := newBenchmarkRowsEvent("testdb", "wanted_table")
b.ReportAllocs()
b.SetBytes(int64(len(data)))
b.ResetTimer()
for i := 0; i < b.N; i++ {
rowsEvent.Rows = nil
rowsEvent.SkippedColumns = nil
if err := rowsEvent.Decode(data); err != nil {
b.Fatal(err)
}
}
})
}go test ./go/binlog/ -run='^$' -bench=BenchmarkRowsEventDecodeFunc -benchmem -benchtime=5s -v
goos: darwin
goarch: arm64
pkg: github.com/github/gh-ost/go/binlog
cpu: Apple M3 Pro
BenchmarkRowsEventDecodeFiltering
BenchmarkRowsEventDecodeFiltering/filtered_table_skips_row_decode
BenchmarkRowsEventDecodeFiltering/filtered_table_skips_row_decode-11 769241822 7.730 ns/op 648137.47 MB/s 0 B/op 0 allocs/op
BenchmarkRowsEventDecodeFiltering/full_rows_event_decode
BenchmarkRowsEventDecodeFiltering/full_rows_event_decode-11 153952 39340 ns/op 127.35 MB/s 137713 B/op 1766 allocs/op
PASS
ok github.com/github/gh-ost/go/binlog 13.471s |
| } | ||
|
|
||
| func NewGoMySQLReader(migrationContext *base.MigrationContext) *GoMySQLReader { | ||
| func NewGoMySQLReader(migrationContext *base.MigrationContext, rowsEventFilters ...RowsEventFilterFunc) *GoMySQLReader { |
There was a problem hiding this comment.
I find it strange that we're making this variadic when we always pass exactly one argument: never zero, never more than one.
There was a problem hiding this comment.
We don't need it ourselves but it does provide backwards compatibility for external callers. I'm not sure whether we (or the gh-ost maintainers) should care about this, but PRs like this one do give the impression there are people using gh-ost as a library.
There was a problem hiding this comment.
There's a somewhat related binlog optimization pending upstream: github#1687, which should stack nicely with this one 👍
|
The change got merged upstream. We'll get it the next time we sync our fork. |
Description
This change avoids decoding row payloads for DML events on tables that gh-ost is not listening to.
Previously, gh-ost decoded every row event from the binlog before filtering by database/table name in the event streamer. This meant DML activity on unrelated tables still paid the full row decoding cost, even though those events were discarded later.
With this change, gh-ost now uses go-mysql’s `RowsEventDecodeFunc` hook to inspect the row event header/table-map metadata first. If no registered listener is interested in the event’s database/table, gh-ost skips decoding the row data entirely.
`script/cibuild` returns with no formatting errors, build errors or unit test errors.