Skip to content

Commit 45eeaa8

Browse files
authored
Merge pull request #843 from benoitc/fix/h2-pool-concurrency-hang
fix(http2): pooled connection wedges under concurrent load
2 parents cc74915 + 59357ef commit 45eeaa8

5 files changed

Lines changed: 168 additions & 22 deletions

File tree

NEWS.md

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,24 @@
33
UNRELEASED
44
----------
55

6+
### Bug Fixes
7+
8+
- Fix HTTP/2 pooled connections wedging under sustained concurrent load
9+
(#836). The pool checks out a TCP connection first then upgrades to
10+
SSL+ALPN; `connected(enter)` armed the 2s pool idle timer while the
11+
protocol was still classified as HTTP/1.1, and the timer then fired
12+
on a busy multiplexed HTTP/2 connection, terminating it mid-request.
13+
`init_h2_connection` / `init_h2_after_upgrade` now explicitly cancel
14+
the idle timer. hackney_conn also traps `EXIT` from the linked
15+
`h2_connection` and stays alive briefly in `closed` state so late
16+
calls that raced the pool checkout get a proper error reply instead
17+
of `exit:{normal, _}`. Pool's `checkout_h2` validates the state of
18+
the connection process (not just `is_process_alive`).
19+
- Bump `h2` dependency to 0.4.0.
20+
621
### Refactor
722

8-
- HTTP/2 is now delegated to the `erlang_h2` library (hex `h2` 0.3.0).
23+
- HTTP/2 is now delegated to the `erlang_h2` library (hex `h2` 0.4.0).
924
Hackney no longer ships its own HTTP/2 framing, HPACK codec, or
1025
connection/stream state machine:
1126
- `hackney_http2.erl`, `hackney_http2_machine.erl`, `hackney_hpack.erl`

rebar.config

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@
6060
%% Pure Erlang QUIC + HTTP/3 stack
6161
{quic, "1.0.0"},
6262
%% Pure Erlang HTTP/2 stack
63-
{h2, "0.3.0"},
63+
{h2, "0.4.0"},
6464
{idna, "~>7.1.0"},
6565
{mimerl, "~>1.4"},
6666
{certifi, "~>2.16.0"},

src/hackney_conn.erl

Lines changed: 55 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,10 @@
107107

108108
-define(CONNECT_TIMEOUT, 8000).
109109
-define(IDLE_TIMEOUT, infinity).
110+
%% Grace window for pooled hackney_conn in `closed` state, during which
111+
%% late-arriving calls race the pool DOWN cleanup and still get a proper
112+
%% error reply instead of exit:{normal, _}. See issue #836.
113+
-define(CLOSED_GRACE_MS, 50).
110114

111115
%% State data record
112116
-record(conn_data, {
@@ -895,6 +899,10 @@ connected({call, From}, {send_headers, Method, Path, Headers}, Data) ->
895899
%% HTTP/2 owner messages from h2 library
896900
connected(info, {h2, H2Conn, Event}, #conn_data{h2_conn = H2Conn} = Data) ->
897901
handle_h2_event(Event, Data);
902+
%% h2_connection is linked via start_link; trap_exit surfaces its termination
903+
%% as an 'EXIT' signal. Convert to the same cleanup path as the monitor DOWN.
904+
connected(info, {'EXIT', H2Conn, Reason}, #conn_data{h2_conn = H2Conn} = Data) ->
905+
h2_on_closed(Reason, Data#conn_data{h2_conn = undefined, h2_mon = undefined});
898906
connected(info, {'DOWN', Mon, process, _Pid, Reason},
899907
#conn_data{h2_mon = Mon} = Data) ->
900908
h2_on_closed(Reason, Data#conn_data{h2_conn = undefined, h2_mon = undefined});
@@ -1406,15 +1414,23 @@ closed(enter, _OldState, #conn_data{socket = Socket, transport = Transport, pool
14061414
undefined -> ok;
14071415
_ -> Transport:close(Socket)
14081416
end,
1409-
%% For pooled connections, stop the process so pool can clean up
1410-
%% For non-pooled connections, stay alive to allow reconnection
1417+
%% Pooled connections used to stop immediately here, but that made
1418+
%% late-arriving {call, From, {request, _}} messages from workers that
1419+
%% raced the pool checkout race a terminating gen_statem — which
1420+
%% surfaces as `exit:{normal, _}` in the caller (issue #836). Stay
1421+
%% alive briefly so those late calls get a proper `{error, {closed, _}}`
1422+
%% reply via handle_common's invalid_state fallback, then stop.
14111423
case PoolPid of
14121424
undefined ->
14131425
{keep_state, Data#conn_data{socket = undefined}};
14141426
_ ->
1415-
{stop, normal, Data#conn_data{socket = undefined}}
1427+
{keep_state, Data#conn_data{socket = undefined},
1428+
[{state_timeout, ?CLOSED_GRACE_MS, closed_grace_expired}]}
14161429
end;
14171430

1431+
closed(state_timeout, closed_grace_expired, Data) ->
1432+
{stop, normal, Data};
1433+
14181434
closed({call, From}, connect, Data) ->
14191435
%% Allow reconnection from closed state
14201436
#conn_data{
@@ -1576,6 +1592,12 @@ handle_common(info, {select, _Resource, _Ref, ready_input},
15761592
_ = hackney_h3:process(ConnRef),
15771593
keep_state_and_data;
15781594

1595+
%% With trap_exit = true, an EXIT signal from any linked process (other than
1596+
%% h2_conn, handled in connected/3) arrives here. Swallow it rather than
1597+
%% propagating — avoids tearing down the gen_statem on stray links.
1598+
handle_common(info, {'EXIT', _Pid, _Reason}, _State, _Data) ->
1599+
keep_state_and_data;
1600+
15791601
handle_common(info, _Msg, _State, _Data) ->
15801602
keep_state_and_data.
15811603

@@ -2303,13 +2325,18 @@ start_h2_connection(Socket, Data, From, Origin) ->
23032325
h2_mon = Mon,
23042326
h2_streams = #{}
23052327
},
2328+
%% Cancel any pending idle_timeout armed by the
2329+
%% TCP-first connected(enter): HTTP/2 connections
2330+
%% multiplex and stay in `connected`, so the 2s
2331+
%% pool default would kill a busy conn (#836).
2332+
CancelIdle = {state_timeout, infinity, idle_timeout},
23062333
case Origin of
23072334
first_connect ->
23082335
{next_state, connected, NewData,
2309-
[{reply, From, ok}]};
2336+
[CancelIdle, {reply, From, ok}]};
23102337
after_upgrade ->
23112338
{keep_state, NewData,
2312-
[{reply, From, ok}]}
2339+
[CancelIdle, {reply, From, ok}]}
23132340
end;
23142341
{error, WaitErr} ->
23152342
catch h2_connection:close(H2Conn),
@@ -2364,17 +2391,26 @@ do_h2_send(From, Method, Path, Headers, Body, StreamState, Mode, Data) ->
23642391
B when is_binary(B) -> B;
23652392
L -> iolist_to_binary(L)
23662393
end,
2367-
SendRes = case BodyBin of
2368-
<<>> -> h2_connection:send_request_headers(H2Conn, H2Headers, true);
2369-
_ ->
2370-
case h2_connection:send_request_headers(H2Conn, H2Headers, false) of
2371-
{ok, SId} ->
2372-
case h2_connection:send_data(H2Conn, SId, BodyBin, true) of
2373-
ok -> {ok, SId};
2374-
{error, _} = E1 -> E1
2375-
end;
2376-
Err -> Err
2377-
end
2394+
%% h2_connection can die between pool checkout and this call; gen_statem:call
2395+
%% on a dead pid raises exit:noproc. Catch that and normalise to an error
2396+
%% so the caller sees {error, {closed, _}} instead of a gen_statem:call
2397+
%% blowing up (issue #836).
2398+
SendRes = try
2399+
case BodyBin of
2400+
<<>> -> h2_connection:send_request_headers(H2Conn, H2Headers, true);
2401+
_ ->
2402+
case h2_connection:send_request_headers(H2Conn, H2Headers, false) of
2403+
{ok, SId} ->
2404+
case h2_connection:send_data(H2Conn, SId, BodyBin, true) of
2405+
ok -> {ok, SId};
2406+
{error, _} = E1 -> E1
2407+
end;
2408+
Err -> Err
2409+
end
2410+
end
2411+
catch
2412+
exit:{ExitReason, _} -> {error, {closed, ExitReason}};
2413+
exit:ExitReason -> {error, {closed, ExitReason}}
23782414
end,
23792415
case SendRes of
23802416
{ok, StreamId} ->
@@ -2544,6 +2580,9 @@ h2_on_closed(Reason, Data) ->
25442580
{Replies, Data1} = collect_h2_aborts({closed, Reason}, Data),
25452581
Stripped = Data1#conn_data{h2_conn = undefined, h2_mon = undefined,
25462582
socket = undefined},
2583+
%% Transition to closed. For pooled conns, closed(enter,...) keeps the
2584+
%% process alive for ?CLOSED_GRACE_MS so calls from workers that raced
2585+
%% the pool checkout get a proper error reply (issue #836).
25472586
{next_state, closed, Stripped, Replies}.
25482587

25492588
collect_h2_aborts(Err, #conn_data{h2_streams = Streams} = Data) ->

src/hackney_pool.erl

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -490,17 +490,18 @@ handle_call({checkin_sync, Pid, ShouldClose}, _From, State) ->
490490
{reply, ok, State2};
491491

492492
handle_call({checkout_h2, Key}, _From, #state{h2_connections = H2Conns} = State) ->
493-
%% HTTP/2 checkout - return existing connection if available
493+
%% HTTP/2 checkout - return existing connection if available.
494+
%% Liveness check includes both process_alive and gen_statem state, so a
495+
%% hackney_conn that already transitioned to `closed` (e.g. after an h2
496+
%% GOAWAY) but has not yet been removed via 'DOWN' is not handed out.
494497
case maps:get(Key, H2Conns, undefined) of
495498
undefined ->
496499
{reply, none, State};
497500
Pid ->
498-
%% Verify connection is still alive
499-
case erlang:is_process_alive(Pid) of
501+
case h2_conn_usable(Pid) of
500502
true ->
501503
{reply, {ok, Pid}, State};
502504
false ->
503-
%% Connection died, remove from pool
504505
H2Conns2 = maps:remove(Key, H2Conns),
505506
{reply, none, State#state{h2_connections = H2Conns2}}
506507
end
@@ -913,6 +914,20 @@ do_checkin_with_close_flag(Pid, ShouldClose, State) ->
913914
State
914915
end.
915916

917+
%% @private Check that a pooled HTTP/2 conn is alive and in `connected` state.
918+
%% Short timeout so a stuck conn doesn't wedge the pool; any failure → unusable.
919+
h2_conn_usable(Pid) ->
920+
case erlang:is_process_alive(Pid) of
921+
false -> false;
922+
true ->
923+
try hackney_conn:get_state(Pid) of
924+
{ok, connected} -> true;
925+
_ -> false
926+
catch
927+
_:_ -> false
928+
end
929+
end.
930+
916931
%% @private Remove an HTTP/2 connection from the pool
917932
do_unregister_h2(Pid, State) ->
918933
#state{h2_connections = H2Conns, pid_monitors = PidMonitors} = State,
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
%%% Regression tests for issue #836: HTTP/2 pooled shared connection wedges
2+
%%% under concurrent sustained load.
3+
%%%
4+
%%% Before the fix, connected(enter) armed a 2s idle_timeout while the
5+
%%% connection was still classified as HTTP/1.1 (pool checks out TCP, then
6+
%%% upgrades to SSL+ALPN); the timer fired on a perfectly busy H2 conn and
7+
%%% tore it down mid-request, crashing in-flight `gen_statem:call`s with
8+
%%% `exit:{normal, _}`.
9+
-module(hackney_http2_concurrency_tests).
10+
11+
-include_lib("eunit/include/eunit.hrl").
12+
13+
%% Resolve test cert dir from the module's beam location so the paths work
14+
%% regardless of where eunit is run from.
15+
cert_dir() ->
16+
BeamDir = filename:dirname(code:which(?MODULE)),
17+
%% _build/test/lib/hackney/test -> project root -> test/certs
18+
Root = filename:join([BeamDir, "..", "..", "..", "..", ".."]),
19+
filename:join([filename:absname(Root), "test", "certs"]).
20+
21+
concurrent_tight_loop_test_() ->
22+
{timeout, 30, fun run_concurrent_tight_loop/0}.
23+
24+
run_concurrent_tight_loop() ->
25+
_ = application:ensure_all_started(hackney),
26+
_ = application:ensure_all_started(h2),
27+
Handler = fun(Conn, Sid, _M, _P, _H) ->
28+
ok = h2:send_response(Conn, Sid, 200,
29+
[{<<"content-type">>, <<"text/plain">>}]),
30+
ok = h2:send_data(Conn, Sid, <<"ok">>, true)
31+
end,
32+
Certs = cert_dir(),
33+
{ok, Server} = h2:start_server(0, #{
34+
cert => filename:join(Certs, "server.pem"),
35+
key => filename:join(Certs, "server.key"),
36+
handler => Handler
37+
}),
38+
Port = h2:server_port(Server),
39+
Pool = hackney_h2_concurrency_pool,
40+
_ = hackney_pool:start_pool(Pool, [{max_connections, 10}]),
41+
try
42+
URL = iolist_to_binary([<<"https://localhost:">>,
43+
integer_to_list(Port), <<"/">>]),
44+
Opts = [{pool, Pool},
45+
{protocols, [http2]},
46+
{recv_timeout, 5000},
47+
{ssl_options, [{insecure, true}, {verify, verify_none}]}],
48+
{ok, 200, _, _} = hackney:request(get, URL, [], <<>>, Opts),
49+
Parent = self(),
50+
Deadline = erlang:monotonic_time(millisecond) + 3000,
51+
Worker = fun(Name) ->
52+
(fun Self(Count) ->
53+
case erlang:monotonic_time(millisecond) < Deadline of
54+
true ->
55+
case hackney:request(get, URL, [], <<>>, Opts) of
56+
{ok, 200, _, _} -> Self(Count + 1);
57+
Other ->
58+
Parent ! {worker, Name, {error, Count, Other}}
59+
end;
60+
false -> Parent ! {worker, Name, {done, Count}}
61+
end
62+
end)(0)
63+
end,
64+
spawn_link(fun() -> Worker(p1) end),
65+
spawn_link(fun() -> Worker(p2) end),
66+
R1 = receive {worker, p1, V1} -> V1 after 8000 -> stall end,
67+
R2 = receive {worker, p2, V2} -> V2 after 2000 -> stall end,
68+
?assertMatch({done, _}, R1),
69+
?assertMatch({done, _}, R2),
70+
{done, N1} = R1,
71+
{done, N2} = R2,
72+
?assert(N1 > 0),
73+
?assert(N2 > 0)
74+
after
75+
catch hackney_pool:stop_pool(Pool),
76+
catch h2:stop_server(Server)
77+
end.

0 commit comments

Comments
 (0)