Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 21 additions & 2 deletions embedding.go
Original file line number Diff line number Diff line change
Expand Up @@ -382,16 +382,35 @@ func injectShadowVectors(ctx context.Context,
}

jsonTag := field.Tag.Get("json")
predicate := strings.Split(jsonTag, ",")[0]
jsonParts := strings.Split(jsonTag, ",")
predicate := jsonParts[0]
if predicate == "" {
predicate = field.Name
}
hasOmitempty := false
for _, opt := range jsonParts[1:] {
if opt == "omitempty" {
hasOmitempty = true
break
}
}
vecPred := vecShadowPredicate(predicate)

textVal := string(sv.Field(i).Interface().(SimString))
_, _, threshold := parseEmbeddingTag(dgraphTag)

// Below threshold (or empty): delete the shadow vector to avoid stale data.
// Empty + omitempty means the caller didn't set this field on a partial
// update. The primary predicate is omitted from the JSON mutation (so
// Dgraph never sees `fn_doc` and the existing string is preserved); the
// shadow vector must follow the same lifecycle, otherwise partial Updates
// (e.g. setting just an edge predicate) silently wipe vectors written by
// a prior Insert/Upsert. To explicitly clear a SimString, drop omitempty
// so the empty value reaches Dgraph and we delete the shadow vec below.
if textVal == "" && hasOmitempty {
continue
}

// Below threshold (or explicitly cleared): delete the shadow vector to avoid stale data.
if textVal == "" || (threshold > 0 && len([]rune(textVal)) < threshold) {
delNquads = append(delNquads, &api.NQuad{
Subject: uid,
Expand Down
58 changes: 58 additions & 0 deletions embedding_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,64 @@ func TestUpdateWithEmbedding(t *testing.T) {
require.Equal(t, "updated text", provider.callLog[1])
}

// TestPartialUpdatePreservesShadowVector is a regression test for the
// omitempty-aware skip in injectShadowVectors. A partial Update of a struct
// that contains a SimString field (e.g. setting only UID + a non-SimString
// field) must not delete the existing shadow vector. The primary predicate
// is preserved by omitempty — Dgraph never sees the SimString in the mutation
// NQuads — so the shadow path must follow the same lifecycle.
//
// Real-world failure that motivated this test: codebase-graph's indexer
// upserts a Function with its doc comment in pass A, then issues a partial
// `cli.Update(&Function{UID, Calls})` in pass B to add CALLS edges. Without
// the omitempty skip, every pass-B Update silently wiped the doc-comment
// vector written in pass A, leaving the shadow predicate sparse and breaking
// similarity ranking once `similar_to`'s k saturated the candidate count.
func TestPartialUpdatePreservesShadowVector(t *testing.T) {
const dims = 4
provider := newMockProvider(dims)
origVec := []float32{1.0, 0.0, 0.0, 0.0}
provider.register("the original description", origVec)

client, cleanup := createEmbeddingClient(t, provider)
defer cleanup()
ctx := context.Background()

p := &embeddableProduct{Name: "Thing", Description: "the original description"}
require.NoError(t, client.Insert(ctx, p))
require.NotEmpty(t, p.UID)
require.Len(t, provider.callLog, 1, "Insert should embed once")

findByOrigVec := func(t *testing.T) string {
t.Helper()
dgoClient, cleanupDgo, err := client.DgraphClient()
require.NoError(t, err)
defer cleanupDgo()
var result embeddableProduct
tx := dg.NewReadOnlyTxn(dgoClient)
err = mg.SimilarTo(tx, &result, "description", origVec, 1).Scan()
require.NoError(t, err)
return result.UID
}

// Sanity: shadow vec is reachable after Insert.
require.Equal(t, p.UID, findByOrigVec(t),
"shadow vec should be present after Insert")

// Partial Update: only UID + a non-SimString field. Description is the
// zero value and is omitted from the mutation by omitempty — the primary
// predicate is preserved, and the shadow vec must be too.
require.NoError(t, client.Update(ctx, &embeddableProduct{UID: p.UID, Name: "Thing v2"}))

// Provider must NOT be re-invoked — there's nothing to embed.
require.Len(t, provider.callLog, 1,
"partial Update must not re-embed when SimString is unset")

// Shadow vec must survive the partial update — this is the regression assertion.
require.Equal(t, p.UID, findByOrigVec(t),
"shadow vec must survive partial Update of unrelated fields")
}

func TestSimilarToQuery(t *testing.T) {
const dims = 5
provider := newMockProvider(dims)
Expand Down
Loading