Skip to content

Commit d103a7f

Browse files
committed
Fix bulk_get error handling
* Previously, responses could end with an error sooner than expected if some of the results were of type `not_found`. That could happen because `not_found` responses do not increment the `rcnt` counter, and `success_possible/1` function relied on the `wcnt + rcnt > 0` check. Update the function to return `true` if there are any workers left (`wcnt > 0`) or there is at least one response for each request. * Maintenance mode errors were not masked and could easily leak to the users. That happened because we returned the last error at the moment we detect that we cannot make progress. If that last error was maintenance mode error, we'd emit that. To fix it, keep track of all the errors we got so far, and if there any other errors besides maintenance mode, then return those error.
1 parent 67d9965 commit d103a7f

1 file changed

Lines changed: 46 additions & 6 deletions

File tree

src/fabric/src/fabric_open_revs.erl

Lines changed: 46 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,8 @@
3030
r,
3131
args,
3232
reqs,
33-
workers
33+
workers,
34+
errors = []
3435
}).
3536

3637
go(_DbName, [], _Options) ->
@@ -72,6 +73,9 @@ handle_message({rexi_DOWN, _, {_, NodeRef}, _}, _Worker, #st{} = St) ->
7273
Reqs1 = maps:fold(FoldFun, Reqs, DeadWorkers),
7374
Error = {error, {nodedown, <<"progress not possible">>}},
7475
handle_error(Error, St#st{workers = Workers1, reqs = Reqs1});
76+
handle_message({rexi_EXIT, {maintenance_mode, _Node}}, Worker, #st{} = St) ->
77+
% Remove the node to make it easier to de-duplicate later
78+
handle_message(maintenance_mode, Worker, St);
7579
handle_message({rexi_EXIT, Reason}, Worker, #st{} = St) ->
7680
handle_message(Reason, Worker, St);
7781
handle_message({error, Reason}, Worker, #st{} = St) ->
@@ -112,7 +116,7 @@ responses_fold({ArgRef, NewResp}, #{} = Reqs) ->
112116
}
113117
}.
114118

115-
handle_error(Error, #st{workers = Workers, reqs = Reqs} = St) ->
119+
handle_error(Error, #st{workers = Workers, errors = Errors, reqs = Reqs} = St) ->
116120
case success_possible(Reqs) of
117121
true ->
118122
case have_viable_workers(Workers) of
@@ -124,7 +128,8 @@ handle_error(Error, #st{workers = Workers, reqs = Reqs} = St) ->
124128
end;
125129
false ->
126130
stop_workers(Workers),
127-
{error, Error}
131+
% We may have multiple errors but need to pick one, so pick the first
132+
{error, hd(merge_errors(Errors, Error))}
128133
end.
129134

130135
% De-duplicate identical responses as we go along
@@ -142,6 +147,17 @@ sort_key({ok, #doc{id = Id, revs = Revs, deleted = Deleted}}) ->
142147
sort_key(NotFound) ->
143148
NotFound.
144149

150+
% We're trying to hide maintenance mode if possible. So if there are
151+
% non-maintenance mode errors, such as timeouts, etc, we remove the maintenance
152+
% mode from the list, otherwise, we keep it.
153+
%
154+
merge_errors(Errors, Error) ->
155+
Errors1 = lists:uniq([Error | Errors]),
156+
case Errors1 of
157+
[maintenance_mode] -> [maintenance_mode];
158+
[_ | _] -> lists:delete(maintenance_mode, Errors1)
159+
end.
160+
145161
% Build a #{ArgRef => #req{}} map. ArgRef references the initial {{Id, Revs},
146162
% Opts} tuples and the #req{} is a record keeping track of response for just
147163
% that {Id, Revs} pair.
@@ -206,11 +222,17 @@ have_viable_workers(#{} = Workers) ->
206222
map_size(Workers) > 0.
207223

208224
% We can still return success if we either have some waiting workers, or at
209-
% least one response already for each {Id, Revs} pair.
225+
% least one response already for each {Id, Revs} pair. We don't simply check
226+
% for W + R > 0 but check that responses have any entries, as not_found entries
227+
% won't bump the R values.
210228
%
211229
success_possible(#{} = Reqs) ->
212-
Fun = fun(_, #req{wcnt = W, rcnt = R}, Acc) -> Acc andalso W + R > 0 end,
213-
maps:fold(Fun, true, Reqs).
230+
maps:fold(fun success_possible_fold/3, true, Reqs).
231+
232+
success_possible_fold(_Key, #req{}, _Acc = false) ->
233+
false;
234+
success_possible_fold(_Key, #req{wcnt = W, responses = Resps}, _Acc) ->
235+
W > 0 orelse Resps =/= [].
214236

215237
r_met(#{} = Reqs, ExpectedR) ->
216238
Fun = fun(_, #req{rcnt = R}, Acc) -> min(R, Acc) end,
@@ -316,6 +338,8 @@ open_revs_quorum_test_() ->
316338
?TDEF_FE(t_finish_quorum),
317339
?TDEF_FE(t_no_quorum_on_different_responses),
318340
?TDEF_FE(t_no_quorum_on_not_found),
341+
?TDEF_FE(t_not_found_and_maintenance_mode),
342+
?TDEF_FE(t_all_maintenance_mode),
319343
?TDEF_FE(t_done_on_third),
320344
?TDEF_FE(t_all_different_responses),
321345
?TDEF_FE(t_ancestors_merge_correctly),
@@ -400,6 +424,22 @@ t_no_quorum_on_not_found(_) ->
400424
Res2 = handle_message([[foo2(), bar1()]], W3, S2),
401425
?assertEqual({stop, [[foo2(), bar1()]]}, Res2).
402426

427+
t_not_found_and_maintenance_mode(_) ->
428+
S0 = #st{workers = Workers0} = st0(),
429+
[W1, W2, W3] = lists:sort(maps:keys(Workers0)),
430+
{ok, S1} = handle_message([[bazNF()]], W1, S0),
431+
{ok, S2} = handle_message({error, timeout}, W2, S1),
432+
Res = handle_message({rexi_EXIT, {maintenance_mode, foo}}, W3, S2),
433+
?assertEqual({stop, [[bazNF()]]}, Res).
434+
435+
t_all_maintenance_mode(_) ->
436+
S0 = #st{workers = Workers0} = st0(),
437+
[W1, W2, W3] = lists:sort(maps:keys(Workers0)),
438+
{ok, S1} = handle_message({rexi_EXIT, {maintenance_mode, foo}}, W1, S0),
439+
{ok, S2} = handle_message({rexi_EXIT, {maintenance_mode, foo}}, W2, S1),
440+
Res = handle_message({rexi_EXIT, {maintenance_mode, foo}}, W3, S2),
441+
?assertEqual({error, maintenance_mode}, Res).
442+
403443
t_done_on_third(_) ->
404444
S0 = #st{workers = Workers0} = st0(),
405445
[W1, W2, W3] = lists:sort(maps:keys(Workers0)),

0 commit comments

Comments
 (0)