1%%
2%% %CopyrightBegin%
3%%
4%% Copyright Ericsson AB 2010-2018. All Rights Reserved.
5%%
6%% Licensed under the Apache License, Version 2.0 (the "License");
7%% you may not use this file except in compliance with the License.
8%% You may obtain a copy of the License at
9%%
10%%     http://www.apache.org/licenses/LICENSE-2.0
11%%
12%% Unless required by applicable law or agreed to in writing, software
13%% distributed under the License is distributed on an "AS IS" BASIS,
14%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15%% See the License for the specific language governing permissions and
16%% limitations under the License.
17%%
18%% %CopyrightEnd%
19%%
20
21%%
22%% Tests of the RFC3539 watchdog state machine as implemented by
23%% module diameter_watchdog.
24%%
25
26-module(diameter_watchdog_SUITE).
27
28-export([suite/0,
29         all/0,
30         init_per_suite/1,
31         end_per_suite/1]).
32
33%% testcases
34-export([reopen/0, reopen/1, reopen/4, reopen/6,
35         suspect/1, suspect/4,
36         okay/1, okay/4]).
37
38-export([id/1,    %% jitter callback
39         run1/1,
40         abuse/1,
41         abuse/2]).
42
43%% diameter_app callbacks
44-export([peer_up/3,
45         peer_down/3]).
46
47%% diameter_tcp message_cb
48-export([message/3]).
49
50-include("diameter.hrl").
51-include("diameter_ct.hrl").
52
53%% ===========================================================================
54
%% Module providing the test helpers called below (run/1, lport/2,
%% unique_string/0).
-define(util, diameter_util).

%% Dictionary, realm and address shared by every service in the suite.
-define(BASE, ?DIAMETER_DICT_COMMON).
-define(REALM, "erlang.org").
-define(ADDR, {127,0,0,1}).

%% Config for diameter:start_service/2. Name serves as both the host
%% part of Origin-Host and the application alias. The match on
%% ?BASE:id() asserts that the dictionary's application id is 0.
-define(SERVICE(Name),
        [{'Origin-Host', Name ++ "." ++ ?REALM},
         {'Origin-Realm', ?REALM},
         {'Host-IP-Address', [?ADDR]},
         {'Vendor-Id', 42},
         {'Product-Name', "OTP/diameter"},
         {'Auth-Application-Id', [0 = ?BASE:id()]},
         {application, [{alias, Name},
                        {dictionary, ?BASE},
                        {module, ?MODULE}]}]).

%% Watchdog timer as a callback: an MFA on id/1, which returns T
%% unchanged.
-define(WD(T), {?MODULE, id, [T]}).

%% Watchdog timers used by the testcases: one plain integer and one
%% callback form of the same value.
-define(WD_TIMERS, [10000, ?WD(10000)]).

%% Watchdog timer of the misbehaving node.
-define(PEER_WD, 10000).

%% A timeout that ensures one watchdog. To ensure only one watchdog
%% requires (Wd + 2000) + 1000 < 2*(Wd - 2000) ==> 7000 < Wd for the
%% case with random jitter.
-define(ONE_WD(Wd), jitter(Wd,2000) + 1000).

%% Pattern for a diameter event message carrying info T.
-define(INFO(T), #diameter_event{info = T}).

%% Receive an event message from diameter. Blocks (no 'after' clause)
%% until an event matching T arrives, passes it through log_event/1
%% and evaluates to the event.
-define(EVENT(T),    %% apply to not bind T_
        apply(fun() ->
                      receive ?INFO(T = T_) -> log_event(T_) end
              end,
              [])).

%% Receive a watchdog event. ?WD_EVENT/1 blocks until a watchdog event
%% for Ref arrives and evaluates to its fourth element, passed through
%% log_wd/1. ?WD_EVENT/2 waits at most Ms milliseconds and evaluates
%% to false if no such event arrives in that time.
-define(WD_EVENT(Ref), log_wd(element(4, ?EVENT({watchdog, Ref, _, _, _})))).
-define(WD_EVENT(Ref, Ms),
        apply(fun() ->
                      receive ?INFO({watchdog, Ref, _, T_, _}) ->
                              log_wd(T_)
                      after Ms ->
                              false
                      end
              end,
              [])).

%% Log to make failures identifiable: each entry is prefixed with the
%% pid of the logging process.
-define(LOG(T),     ?LOG("~p", [T])).
-define(LOG(F,A),   ct:pal("~p: " ++ F, [self() | A])).
-define(WARN(F,A),  ct:pal(error, "~p: " ++ F, [self() | A])).
111
112%% ===========================================================================
113
%% Suite-wide Common Test info: a 90 second timetrap.
suite() ->
    Timetrap = {timetrap, {seconds, 90}},
    [Timetrap].
116
%% The testcases of the suite, in the order listed.
all() ->
    [reopen, suspect, okay].
121
%% Start diameter for the duration of the suite. The match fails the
%% suite if the start does not return ok.
init_per_suite(Cfg) ->
    ok = diameter:start(),
    Cfg.
125
%% Stop diameter again once all testcases have run.
end_per_suite(_) ->
    ok = diameter:stop().
128
129%% ===========================================================================
130%% # reopen/1
131%% ===========================================================================
132
133%% Test the watchdog state machine for the required failover, failback
134%% and reopen behaviour by examining watchdog events.
135
%% Testcase info for reopen/1: a longer timetrap than the suite's,
%% sized for 20 watchdogs @ 15 sec.
reopen() ->
    Limit = {minutes, 5},
    [{timetrap, Limit}].
138
%% Run reopen/4 for every combination of tested transport type,
%% watchdog timer, number of answered DWR's and failback-inducing
%% message. run/1 returning [] asserts the outcome of the runs.
reopen(_Config) ->
    Tested   = [listen, connect],        %% watchdog to test
    Timers   = ?WD_TIMERS,               %% watchdog_timer value
    Answered = [0,1,2],                  %% DWR's to answer before ignoring
    Failback = ['DWR', 'DWA', 'RAA'],    %% how to induce failback
    Jobs = [[reopen, T, W, N, M] || T <- Tested,
                                    W <- Timers,
                                    N <- Answered,
                                    M <- Failback],
    [] = run(Jobs).
145
%% Run both nodes of a single reopen scenario: one listening and one
%% connecting, of which the type equal to Test is the watchdog being
%% tested.
reopen(Test, Wd, N, M) ->
    %% Publish a ref to ensure the connecting transport is added only
    %% once events from the listening transport are subscribed to.
    Ref = make_ref(),
    Node = fun(Type) -> [reopen, Type, Test, Ref, Wd, N, M] end,
    [] = run(lists:map(Node, [listen, connect])).
151
%% reopen/6
%%
%% Start a service and transport of the given type, configured by
%% cfg/3, then hand over to reopen/7 with the service name and
%% transport reference.

reopen(Type, Test, Ref, Wd, N, M) ->
    Opts = cfg(Type, Test, Wd),
    {SvcName, TRef} = start(Type, Ref, Opts),
    reopen(Type, Test, SvcName, TRef, Wd, N, M).
157
%% cfg/3
%%
%% The {Timer, WatchdogConfig, Fake} triple passed to start/3. The
%% node whose type is the one being tested gets the watchdog timer
%% under test, no extra watchdog config and no message callback. The
%% other node gets the peer timer, {okay, 0} and Fake = true.
cfg(Type, Test, Wd)
  when Type =:= Test ->
    {Wd, [], false};
cfg(_, _, _) ->
    {?WD(?PEER_WD), [{okay, 0}], true}.
162
%% reopen/7
%%
%% Runs one side of a reopen scenario against the watchdog events of
%% transport Ref on service SvcName. The first clause, in which the
%% node's type equals the type under test, asserts the expected
%% sequence of watchdog transitions and DWR/DWA counters. The second
%% clause is the misbehaving peer, which uses reg/3 to publish the
%% terms that message/3 looks up to decide which watchdog answers to
%% send.

%% The watchdog to be tested.
reopen(Type, Type, SvcName, Ref, Wd, N, M) ->
    ?LOG("node ~p", [[Type, SvcName, Ref, Wd, N, M]]),

    %% Connection should come up immediately as a consequence of
    %% starting the watchdog process. In the accepting case this
    %% results in a new watchdog on a transport waiting for a new
    %% connection.

    {initial, okay} = ?WD_EVENT(Ref),
    ?EVENT({up, Ref, _, _, #diameter_packet{}}),

    %%   OKAY          Timer expires &      Failover()
    %%                 Pending              SetWatchdog()        SUSPECT
    %%
    %% The peer replies to N DWR's before becoming silent, we should
    %% go down after N+2 watchdog_timer expirations: that is, after
    %% the first unanswered DWR. Knowing the min/max watchdog timeout
    %% values gives the time interval in which the event is expected.

    [0,0,0,0] = wd_counts(SvcName),

    {okay, suspect} = ?WD_EVENT(Ref),
    ?EVENT({down, Ref, _, _}),

    %% N received DWA's
    [_,_,_,N] = wd_counts(SvcName),

    %%   SUSPECT       Receive DWA          Pending = FALSE
    %%                                      Failback()
    %%                                      SetWatchdog()        OKAY
    %%
    %%   SUSPECT       Receive non-DWA      Failback()
    %%                                      SetWatchdog()        OKAY
    %%
    %% The peer sends a message before the expiry of another watchdog
    %% to induce failback.

    {suspect, okay} = ?WD_EVENT(Ref),
    ?EVENT({up, Ref, _, _}),

    %% N+1 sent DWR's, N/N+1 received DWA's
    R1 = N+1,
    A1 = choose(M == 'DWA', R1, N),
    [R1,_,_,A1] = wd_counts(SvcName),

    %%   OKAY          Timer expires &      SendWatchdog()
    %%                 !Pending             SetWatchdog()
    %%                                      Pending = TRUE       OKAY
    %%
    %%   OKAY          Timer expires &      Failover()
    %%                 Pending              SetWatchdog()        SUSPECT
    %%
    %% The peer is now ignoring all watchdogs so the connection goes
    %% back down after either one or two watchdog expiries, depending
    %% on whether or not DWA restored the connection.

    {okay, suspect} = ?WD_EVENT(Ref),
    ?EVENT({down, Ref, _, _}),

    %%   SUSPECT       Timer expires        CloseConnection()
    %%                                      SetWatchdog()        DOWN
    %%
    %% Non-response brings the connection down after another timeout.

    {suspect, down} = ?WD_EVENT(Ref),

    %% One more DWR has been sent only if failback was induced by DWA;
    %% no further DWA's have been received.
    R2 = R1 + choose(M == 'DWA', 1, 0),
    A2 = A1,
    [R2,_,_,A2] = wd_counts(SvcName),

    %%   DOWN          Timer expires        AttemptOpen()
    %%                                      SetWatchdog()        DOWN
    %%
    %%   DOWN          Connection up        NumDWA = 0
    %%                                      SendWatchdog()
    %%                                      SetWatchdog()
    %%                                      Pending = TRUE       REOPEN
    %%
    %% The connection is reestablished after another timeout.

    recv_reopen(Type, Ref),

    %%   REOPEN        Receive non-DWA      Throwaway()          REOPEN
    %%
    %%   REOPEN        Receive DWA &        Pending = FALSE
    %%                 NumDWA < 2           NumDWA++             REOPEN
    %%
    %%   REOPEN        Receive DWA &        Pending = FALSE
    %%                 NumDWA == 2          NumDWA++
    %%                                      Failback()           OKAY
    %%
    %%   REOPEN        Timer expires &      SendWatchdog()
    %%                 !Pending             SetWatchdog()
    %%                                      Pending = TRUE       REOPEN
    %%
    %% An exchange of 3 watchdogs (the first directly after
    %% capabilities exchange) brings the connection back up.

    {reopen, okay} = ?WD_EVENT(Ref),
    ?EVENT({up, Ref, _, _, #diameter_packet{}}),

    %% Three DWR's have been answered.
    R3 = R2 + 3,
    A3 = A2 + 3,
    [R3,_,_,A3] = wd_counts(SvcName),

    %% Non-response brings it down again.

    {okay, suspect} = ?WD_EVENT(Ref),
    ?EVENT({down, Ref, _, _}),
    {suspect, down} = ?WD_EVENT(Ref),

    %% One more DWR sent, still unanswered.
    R4 = R3 + 1,
    A4 = A3,
    [R4,_,_,A4] = wd_counts(SvcName),

    %% Reestablish after another watchdog.

    recv_reopen(Type, Ref),

    %%   REOPEN        Timer expires &      NumDWA = -1
    %%                 Pending &            SetWatchdog()
    %%                 NumDWA >= 0                               REOPEN
    %%
    %%   REOPEN        Timer expires &      CloseConnection()
    %%                 Pending &            SetWatchdog()
    %%                 NumDWA < 0                                DOWN
    %%
    %% The peer is now ignoring all watchdogs, so the connection goes
    %% down again after 2 timeouts.

    {reopen, down} = ?WD_EVENT(Ref);

%% The misbehaving peer.
reopen(Type, _, SvcName, Ref, Wd, N, M) ->
    ?LOG("peer ~p", [[Type, SvcName, Ref, Wd, N, M]]),

    %% First transport process.
    {initial, okay} = ?WD_EVENT(Ref),
    ?EVENT({up, Ref, _, _, #diameter_packet{}}),

    %% Publish the scenario for send/2: answer N watchdogs, then
    %% withhold the answer and send M after a delay.
    reg(Ref, SvcName, {SvcName, {Wd,N,M}}),

    {okay, down} = ?WD_EVENT(Ref),

    %% Second transport process.
    ?EVENT({watchdog, Ref, _, {_, okay}, _}),
    reg(Ref, SvcName, 3),  %% answer 3 watchdogs then fall silent
    ?EVENT({watchdog, Ref, _, {_, down}, _}),

    %% Third transport process.
    ?EVENT({watchdog, Ref, _, {_, okay}, _}),
    reg(Ref, SvcName, 0),  %% disable outgoing DWA
    ?EVENT({watchdog, Ref, _, {_, down}, _}),

    ok.
322
%% Log a watchdog state transition and return it unchanged.
log_wd({Old, New}) ->
    ?LOG("~p -> ~p", [Old, New]),
    {Old, New}.
326
%% Log the tag of an event unless it is a watchdog event (those are
%% logged by log_wd/1), and return the event unchanged.
log_event(Event) ->
    case element(1, Event) of
        watchdog ->
            true;
        Tag ->
            ?LOG("~p", [Tag])
    end,
    Event.
331
%% recv_reopen/2
%%
%% Wait for the watchdog of transport TRef to enter state reopen. A
%% listening transport may arrive there from any state. A connecting
%% transport must come from down and is additionally expected to emit
%% a reconnect event.

recv_reopen(listen, TRef) ->
    {_, reopen} = ?WD_EVENT(TRef);

recv_reopen(connect, TRef) ->
    {down, reopen} = ?WD_EVENT(TRef),
    ?EVENT({reconnect, TRef, _}).
340
%% reg/3
%%
%% Find the transport process of transport TRef on service SvcName and
%% register the term T under its pid, where send/2 (reached through
%% message/3) waits for it.
reg(TRef, SvcName, T) ->
    Transports = diameter:service_info(SvcName, transport),
    Key = {?MODULE, tpid(TRef, Transports), T},
    true = diameter_reg:add_new(Key).
348
%% tpid/2
%%
%% Extract the pid owning the port of transport Ref from the result of
%% diameter:service_info(SvcName, transport). The service is expected
%% to have exactly one transport. For a listening transport exactly
%% one accepted connection must have a watchdog in state okay or
%% reopen, and that connection's owner is returned.

tpid(Ref, [[{ref, Ref},
            {type, connect},
            {options, _},
            {watchdog, _},
            {peer, _},
            {apps, _},
            {caps, _},
            {port, [{owner, Owner} | _]}
            | _]]) ->
    Owner;

tpid(Ref, [[{ref, Ref},
            {type, listen},
            {options, _},
            {accept, Accepted}
            | _]]) ->
    IsLive = fun([{watchdog, {_,_,State}} | _]) ->
                     State == okay orelse State == reopen
             end,
    [[{watchdog, _},
      {peer, _},
      {apps, _},
      {caps, _},
      {port, [{owner, Owner} | _]}
      | _]]
        = lists:filter(IsLive, Accepted),
    Owner.
378
379%% ===========================================================================
380%% # suspect/1
381%% ===========================================================================
382
%% Configure transports to require a set number of watchdog timeouts
%% before moving from OKAY to SUSPECT, running suspect/4 through
%% abuse/1 once for each configured count.

suspect(_Config) ->
    Job = fun(N) -> [abuse, [suspect, N]] end,
    [] = run(lists:map(Job, [0,1,3])).
388
%% suspect/4
%%
%% Entry clause (third argument a reference): start a node with
%% {suspect, N} as watchdog config, await the initial transition to
%% okay and continue in one of the clauses below, whose third argument
%% is the service name.

suspect(Type, Fake, Ref, N)
  when is_reference(Ref) ->
    Opts = {?WD(10000), [{suspect, N}], Fake},
    {SvcName, TRef} = start(Type, Ref, Opts),
    {initial, okay} = ?WD_EVENT(TRef),
    suspect(TRef, Fake, SvcName, N);

%% The faked node: never answer a watchdog and wait to leave okay.
suspect(TRef, true, SvcName, _N) ->
    reg(TRef, SvcName, 0),  %% disable outgoing DWA
    {okay, _} = ?WD_EVENT(TRef);

%% SUSPECT disabled.
suspect(TRef, false, SvcName, 0) ->
    %% Wait 2+ watchdogs and see that only one watchdog has been sent.
    Quiet = 28000,
    false = ?WD_EVENT(TRef, Quiet),
    [1,0,0,0] = wd_counts(SvcName);

suspect(TRef, false, SvcName, N) ->
    OneDWR = [1,0,0,0],
    %% Check that no watchdog transition takes place within N+
    %% watchdogs ...
    Quiet = N*10000+8000,
    false = ?WD_EVENT(TRef, Quiet),
    OneDWR = wd_counts(SvcName),
    %% ... but that the connection then becomes suspect ...
    {okay, suspect} = ?WD_EVENT(TRef, 10000),
    OneDWR = wd_counts(SvcName),
    %% ... and goes down.
    {suspect, down} = ?WD_EVENT(TRef, 18000),
    OneDWR = wd_counts(SvcName).
416
%% abuse/1
%%
%% Run abuse/2 on F twice: once with the listening and once with the
%% connecting transport as second argument.

abuse(F) ->
    Job = fun(Type) -> [abuse, F, Type] end,
    [] = run(lists:map(Job, [listen, connect])).
421
%% abuse/2
%%
%% With an argument list of at least three elements: log the arguments
%% and apply the exported function F to them. Otherwise F is a
%% function name, or a list of a name followed by extra arguments, and
%% a listening and a connecting node are run in parallel. Each calls F
%% with its type, whether that type equals Test, a shared reference
%% and the extra arguments.

abuse(F, [_,_,_|_] = Args) ->
    ?LOG("~p", [Args]),
    apply(?MODULE, F, Args);

abuse([F|Extra], Test) ->
    Ref = make_ref(),
    Node = fun(Type) -> [abuse, F, [Type, Type == Test, Ref | Extra]] end,
    [] = run(lists:map(Node, [listen, connect]));

abuse(F, Test) ->
    abuse([F], Test).
433
434%% ===========================================================================
435%% # okay/1
436%% ===========================================================================
437
%% Configure the number of watchdog exchanges before moving from
%% REOPEN to OKAY, running okay/4 through abuse/1 once for each
%% configured count.

okay(_Config) ->
    Job = fun(N) -> [abuse, [okay, N]] end,
    [] = run(lists:map(Job, [0,2,3])).
443
%% okay/4
%%
%% Start a node whose watchdog config is {okay, N}, or {okay, 0} on
%% the faked node, await the initial transition to okay and continue
%% in okay/5. The state from which a re-established connection leaves
%% is expected to be initial on a listening transport and down on a
%% connecting one.

okay(Type, Fake, Ref, N)
  when is_reference(Ref) ->
    Exchanges = choose(Fake, 0, N),
    Opts = {?WD(10000), [{okay, Exchanges}], Fake},
    {SvcName, TRef} = start(Type, Ref, Opts),
    {initial, okay} = ?WD_EVENT(TRef),
    Down = choose(Type == listen, initial, down),
    okay(TRef, Fake, SvcName, Down, N).
456
%% okay/5
%%
%% The faked node (second argument true) first withholds watchdog
%% answers so that the connection is lost, then answers on the
%% re-established connection, and finally waits for that connection to
%% go down too. The tested node expects to pass through suspect to
%% down after a single unanswered DWR and leaves the rest to ok/4.

okay(Ref, true, Svc, From, _N) ->
    reg(Ref, Svc, 0),  %% disable outgoing DWA
    {okay, down} = ?WD_EVENT(Ref),
    {From, okay} = ?WD_EVENT(Ref),
    reg(Ref, Svc, -1), %% enable outgoing DWA
    {okay, down} = ?WD_EVENT(Ref);

okay(Ref, false, Svc, From, N) ->
    {okay, suspect} = ?WD_EVENT(Ref),
    OneDWR = [1,0,0,0],
    OneDWR = wd_counts(Svc),
    {suspect, down} = ?WD_EVENT(Ref),
    ok(Ref, Svc, From, N).
469
%% ok/4
%%
%% Expectations on the tested node once its connection has gone down,
%% depending on the number N of watchdog exchanges configured as
%% required before moving from REOPEN to OKAY.

ok(Ref, Svc, From, 0) ->
    %% Connection comes up without watchdog exchange.
    {From, okay} = ?WD_EVENT(Ref),
    [1,0,0,0] = wd_counts(Svc),
    %% Wait 2+ watchdog timeouts to see that the connection stays up
    %% and two watchdogs are exchanged.
    Quiet = 28000,
    false = ?WD_EVENT(Ref, Quiet),
    [3,0,0,2] = wd_counts(Svc);

ok(Ref, Svc, From, N) ->
    %% Connection required watchdog exchange before reaching OKAY.
    {From, reopen} = ?WD_EVENT(Ref),
    {reopen, okay} = ?WD_EVENT(Ref),
    %% One DWR was sent in moving to suspect, plus N more to reopen
    %% the connection.
    Sent = N+1,
    [Sent,0,0,N] = wd_counts(Svc).
487
488%% ===========================================================================
489
%% wd_counts/1
%%
%% Watchdog message counters of the single transport of service
%% SvcName, as [DWR sent, DWA sent, DWR received, DWA received]. A
%% counter missing from the statistics counts as 0.

wd_counts(SvcName) ->
    [Transport] = diameter:service_info(SvcName, transport),
    {statistics, Stats} = lists:keyfind(statistics, 1, Transport),
    %% Keys are {{ApplicationId, CommandCode, RequestBit}, Direction}.
    Keys = [{{0,280,1}, send},
            {{0,280,0}, send},
            {{0,280,1}, recv},
            {{0,280,0}, recv}],
    [proplists:get_value(Key, Stats, 0) || Key <- Keys].
497
%% start/3
%%
%% Start a uniquely named service with a single transport of the given
%% type, subscribing the caller to the service's events beforehand.
%% The service name is then registered under {Type, Ref, Name}, which
%% is how the connecting node locates its listening peer in cfg/2.
%% Returns {Name, TRef}.

start(Type, Ref, T) ->
    Name = hostname(),
    true = diameter:subscribe(Name),
    SvcOpts = [{monitor, self()} | ?SERVICE(Name)],
    ok = diameter:start_service(Name, SvcOpts),
    TransportOpts = opts(Type, Ref, T),
    {ok, TRef} = diameter:add_transport(Name, {Type, TransportOpts}),
    true = diameter_reg:add_new({Type, Ref, Name}),
    {Name, TRef}.
507
%% opts/3
%%
%% Transport options for diameter:add_transport/2: TCP on the loopback
%% address with an ephemeral local port, the given watchdog timer and
%% watchdog config, and a message callback if Fake is true.

opts(Type, Ref, {Timer, Config, Fake})
  when is_boolean(Fake) ->
    TConfig = lists:append([mod(Fake),
                            [{ip, ?ADDR}, {port, 0}],
                            cfg(Type, Ref)]),
    [{transport_module, diameter_tcp},
     {transport_config, TConfig},
     {watchdog_timer, Timer},
     {watchdog_config, Config}].
515
%% The message_cb transport option as a list: message/3 with initial
%% state capx if Fake is true, otherwise the empty list.
mod(Fake) ->
    Callback = {message_cb, [fun message/3, capx]},
    [Callback || Fake].
518
%% cfg/2
%%
%% Type-specific transport config. A listening transport needs none. A
%% connecting transport waits for the listening node to register
%% itself under Ref (see start/3), looks up the port it is listening
%% on and connects to that port.

cfg(connect, Ref) ->
    [{{_, _, Peer}, _Pid}] = diameter_reg:wait({listen, Ref, '_'}),
    [[{ref, ListenRef} | _]] = diameter:service_info(Peer, transport),
    [Port] = ?util:lport(tcp, ListenRef),
    [{raddr, ?ADDR}, {rport, Port}];
cfg(listen, _) ->
    [].
526
527%% ===========================================================================
528
%% message/3
%%
%% Message callback of the faked node, configured as message_cb in
%% mod/1. Outgoing messages are handed to send/2 together with the
%% callback state, incoming messages are passed on unchanged, and any
%% other invocation results in nothing.

message(Direction, Bin, State) ->
    case Direction of
        send ->
            send(Bin, State);
        recv ->
            [Bin];
        _ ->
            []
    end.
539
%% send/2
%%
%% Handle an outgoing message of the faked node. The second argument
%% is the callback state threaded through message/3: capx before
%% capabilities exchange, init until a term has been published with
%% reg/3, thereafter that term, updated as watchdogs are answered. The
%% result is a list in the format of a diameter_tcp message_cb return
%% value. As used below: a binary is sent, a binary following the atom
%% recv is treated as received, and a trailing fun with arguments
%% replaces the callback. NOTE(review): {defer, Ms, Msgs} presumably
%% delays Msgs by Ms milliseconds; confirm against diameter_tcp.
%%
%% Clause order matters: an outgoing DWR is matched before any state
%% is examined.

%% First outgoing message from a new transport process is CER/CEA.
%% Remaining outgoing messages are either DWR or DWA.
send(Bin, capx) ->
    <<_:32, _:8, 257:24, _/binary>> = Bin,  %% assert on CER/CEA
    [Bin, fun message/3, init];

%% Outgoing DWR: fake reception of DWA. Use the fact that AVP values
%% are ignored. This is to ensure that the peer's watchdog state
%% transitions are only induced by responses to messages it sends.
send(<<_:32, 1:1, _:7, 280:24, _:32, EId:32, HId:32, _/binary>>, _) ->
    %% Answer with the identifiers of the request so that it is
    %% matched as the answer to this DWR.
    Pkt = #diameter_packet{header = #diameter_header{version = 1,
                                                     end_to_end_id = EId,
                                                     hop_by_hop_id = HId},
                           msg = ['DWA', {'Result-Code', 2001},
                                         {'Origin-Host', "XXX"},
                                         {'Origin-Realm', ?REALM}]},
    #diameter_packet{bin = Bin} = diameter_codec:encode(?BASE, Pkt),
    [recv, Bin];

%% First outgoing DWA. Block until the testcase has published a term
%% for this transport process with reg/3.
send(Bin, init) ->
    [{{?MODULE, _, T}, _}] = diameter_reg:wait({?MODULE, self(), '_'}),
    send(Bin, T);

%% First transport process. Remember the origin AVPs of the service in
%% the process dictionary for use in msg/3.
send(Bin, {SvcName, {_,_,_} = T}) ->
    [{'Origin-Host', _} = OH, {'Origin-Realm', _} = OR | _]
        = ?SERVICE(SvcName),
    putr(origin, [OH, OR]),
    send(Bin, T);

%% Discard DWA, failback after another timeout in the peer.
send(Bin, {Wd, 0 = No, Msg}) ->
    Origin = getr(origin),
    [{defer, ?ONE_WD(Wd), [msg(Msg, Bin, Origin)]}, fun message/3, No];

%% Send DWA while we're in the mood (aka 0 < N).
send(Bin, {Wd, N, Msg}) ->
    [Bin, fun message/3, {Wd, N-1, Msg}];

%% Discard DWA.
send(_Bin, 0 = No) ->
    [fun message/3, No];

%% Send DWA. The binary match asserts that the message is a DWA. A
%% negative N never reaches 0, so every DWA is sent.
send(<<_:32, 0:1, _:7, 280:24, _/binary>> = DWA, N) ->
    [DWA, fun message/3, N-1].
589
%% msg/3
%%
%% The binary with which the faked node induces failback in its peer:
%% the outgoing DWA itself, or an encoded message of the named type
%% built by msg/2 from the origin AVPs.

msg('DWA', DWA, _) ->
    DWA;
msg(Name, _, Origin) ->
    Pkt = diameter_codec:encode(?BASE, msg(Name, Origin)),
    #diameter_packet{bin = Encoded} = Pkt,
    Encoded.
598
%% msg/2
%%
%% A message as a list of its name followed by AVPs, the given AVPs
%% being placed last.

msg('DWR', Avps) ->
    ['DWR' | Avps];

%% An unexpected answer is discarded after passing through the
%% watchdog state machine.
msg('RAA', Avps) ->
    SessionId = {'Session-Id', diameter:session_id("abc")},
    ResultCode = {'Result-Code', 2001},
    ['RAA', SessionId, ResultCode | Avps].
608
609%% ===========================================================================
610
%% diameter_app callback: a peer coming up leaves the state unchanged.
peer_up(_, _, State) ->
    State.
613
%% diameter_app callback: a peer going down leaves the state
%% unchanged.
peer_down(_, _, State) ->
    State.
616
617%% ===========================================================================
618
%% Select the second argument if the first is true, the third if it is
%% false.
choose(Flag, IfTrue, IfFalse)
  when is_boolean(Flag) ->
    case Flag of
        true  -> IfTrue;
        false -> IfFalse
    end.
621
%% id/1
%%
%% Jitter callback, referenced from the ?WD macro: the identity
%% function, so the value passed in is returned exactly as given.

id(Timer) ->
    Timer.
628
%% run/1
%%
%% Hand each element of Fs to the test utility's run/1, wrapped so
%% that it is evaluated through run1/1, which gives a more useful
%% badmatch in case of failure.

run(Fs) ->
    Wrap = fun(F) -> {?MODULE, [run1, F]} end,
    ?util:run(lists:map(Wrap, Fs)).
635
%% run1/1
%%
%% Apply the exported function F to arguments A. Any exception is
%% logged together with the arguments and then turned into a badmatch
%% on its stacktrace, so that a failure identifies where it came from.

run1([F|A]) ->
    Outcome = try apply(?MODULE, F, A) of
                  _ ->
                      ok
              catch
                  Class:Reason:Stack ->
                      ?WARN("~p", [{A, Class, Reason, Stack}]),
                      Stack
              end,
    ok = Outcome.
645
%% jitter/2
%%
%% An upper bound on a watchdog timeout: a timer given in callback
%% form is taken as is, while Delta is added to any other timer.

jitter(?WD(Timeout), _Delta) ->
    Timeout;
jitter(Timeout, Delta) ->
    Timeout + Delta.
652
%% A unique string from the test utility, used by start/3 as service
%% name and thereby as the host part of Origin-Host.
hostname() ->
    Name = ?util:unique_string(),
    Name.
656
%% Store Value in the process dictionary under a key private to this
%% module. Returns the value previously stored, or undefined.
putr(Key, Value) ->
    erlang:put({?MODULE, Key}, Value).
659
%% Read the value stored under Key by putr/2 in the calling process,
%% or undefined if there is none.
getr(Key) ->
    erlang:get({?MODULE, Key}).
662