%%
%% %CopyrightBegin%
%%
%% Copyright Ericsson AB 2010-2018. All Rights Reserved.
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%%     http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%
%% %CopyrightEnd%
%%

%%
%% Tests of the RFC3539 watchdog state machine as implemented by
%% module diameter_watchdog.
%%

-module(diameter_watchdog_SUITE).

%% common_test callbacks
-export([suite/0,
         all/0,
         init_per_suite/1,
         end_per_suite/1]).

%% testcases
-export([reopen/0, reopen/1, reopen/4, reopen/6,
         suspect/1, suspect/4,
         okay/1, okay/4]).

%% Exported since they are invoked through apply/3 or as a callback.
-export([id/1,    %% jitter callback
         run1/1,
         abuse/1,
         abuse/2]).

%% diameter_app callbacks
-export([peer_up/3,
         peer_down/3]).

%% diameter_tcp message_cb
-export([message/3]).

-include("diameter.hrl").
-include("diameter_ct.hrl").

%% ===========================================================================

%% Helper module used by this suite for run/1, lport/2 and
%% unique_string/0.
-define(util, diameter_util).

%% Dictionary, realm and address shared by all services in the suite.
-define(BASE, ?DIAMETER_DICT_COMMON).
-define(REALM, "erlang.org").
-define(ADDR, {127,0,0,1}).

%% Config for diameter:start_service/2. Name is used both as the host
%% part of Origin-Host and as the application alias.
-define(SERVICE(Name),
        [{'Origin-Host', Name ++ "." ++ ?REALM},
         {'Origin-Realm', ?REALM},
         {'Host-IP-Address', [?ADDR]},
         {'Vendor-Id', 42},
         {'Product-Name', "OTP/diameter"},
         {'Auth-Application-Id', [0 = ?BASE:id()]},
         {application, [{alias, Name},
                        {dictionary, ?BASE},
                        {module, ?MODULE}]}]).

%% Watchdog timer as a callback: an MFA on id/1, which returns T
%% unchanged.
-define(WD(T), {?MODULE, id, [T]}).
%% Watchdog timers used by the testcases: one plain integer value and
%% one callback form.
-define(WD_TIMERS, [10000, ?WD(10000)]).

%% Watchdog timer of the misbehaving node.
-define(PEER_WD, 10000).

%% A timeout that ensures one watchdog. To ensure only one watchdog
%% requires (Wd + 2000) + 1000 < 2*(Wd - 2000) ==> 7000 < Wd for the
%% case with random jitter.
-define(ONE_WD(Wd), jitter(Wd,2000) + 1000).

%% Pattern for a diameter event message carrying info T.
-define(INFO(T), #diameter_event{info = T}).

%% Receive an event message from diameter. Blocks until a matching
%% event arrives (there is no 'after' clause); the matched info term
%% is passed through log_event/1 and returned.
-define(EVENT(T),    %% apply to not bind T_
        apply(fun() ->
                      receive ?INFO(T = T_) -> log_event(T_) end
              end,
              [])).

%% Receive a watchdog event. The one-argument form blocks and returns
%% the {From, To} transition; the two-argument form returns false if
%% no watchdog event for Ref arrives within Ms milliseconds.
-define(WD_EVENT(Ref), log_wd(element(4, ?EVENT({watchdog, Ref, _, _, _})))).
-define(WD_EVENT(Ref, Ms),
        apply(fun() ->
                      receive ?INFO({watchdog, Ref, _, T_, _}) ->
                              log_wd(T_)
                      after Ms ->
                              false
                      end
              end,
              [])).

%% Log to make failures identifiable: every line is prefixed with the
%% pid of the logging process.
-define(LOG(T),     ?LOG("~p", [T])).
-define(LOG(F,A),   ct:pal("~p: " ++ F, [self() | A])).
-define(WARN(F,A),  ct:pal(error, "~p: " ++ F, [self() | A])).

%% ===========================================================================

%% suite/0
%%
%% Default timetrap for testcases that do not specify their own.

suite() ->
    [{timetrap, {seconds, 90}}].

%% all/0
%%
%% The testcases of the suite.

all() ->
    [reopen,
     suspect,
     okay].

%% init_per_suite/1
%%
%% Start the diameter application for the duration of the suite.

init_per_suite(Config) ->
    ok = diameter:start(),
    Config.

%% end_per_suite/1
%%
%% Stop the diameter application started in init_per_suite/1.

end_per_suite(_Config) ->
    ok = diameter:stop().

%% ===========================================================================
%% # reopen/1
%% ===========================================================================

%% Test the watchdog state machine for the required failover, failback
%% and reopen behaviour by examining watchdog events.

%% reopen/0
%%
%% Testcase-specific timetrap, overriding that of suite/0.

reopen() ->
    [{timetrap, {minutes, 5}}].
%% Timetrap rationale for reopen/0: 20 watchdogs @ 15 sec

%% reopen/1
%%
%% Testcase: run reopen/4 for every combination of tested transport
%% type, watchdog timer, number of answered DWR's and failback
%% message. run/1 must return [] for the testcase to succeed.

reopen(_) ->
    [] = run([[reopen, T, W, N, M]
              || T <- [listen, connect], %% watchdog to test
                 W <- ?WD_TIMERS,        %% watchdog_timer value
                 N <- [0,1,2],           %% DWR's to answer before ignoring
                 M <- ['DWR', 'DWA', 'RAA']]). %% how to induce failback

%% reopen/4
%%
%% One combination: run reopen/6 for both a listening and a connecting
%% transport, Test identifying which of the two is the watchdog under
%% test.

reopen(Test, Wd, N, M) ->
    %% Publish a ref to ensure the connecting transport is added only
    %% once events from the listening transport are subscribed to.
    Ref = make_ref(),
    [] = run([[reopen, T, Test, Ref, Wd, N, M] || T <- [listen, connect]]).

%% reopen/6
%%
%% Start a service with a transport of the given Type and hand over to
%% reopen/7 with the service name and transport reference.

reopen(Type, Test, Ref, Wd, N, M) ->
    {SvcName, TRef} = start(Type, Ref, cfg(Type, Test, Wd)),
    reopen(Type, Test, SvcName, TRef, Wd, N, M).

%% cfg/3
%%
%% The {Timer, WatchdogConfig, Fake} triple passed to start/3. The
%% transport under test (Type == Test) gets the timer being tested and
%% no message callback; the peer gets ?PEER_WD as a callback timer,
%% {okay, 0} as watchdog_config and a message callback (Fake = true).

cfg(Type, Type, Wd) ->
    {Wd, [], false};
cfg(_Type, _Test, _Wd) ->
    {?WD(?PEER_WD), [{okay, 0}], true}.

%% reopen/7

%% The watchdog to be tested.
reopen(Type, Type, SvcName, Ref, Wd, N, M) ->
    ?LOG("node ~p", [[Type, SvcName, Ref, Wd, N, M]]),

    %% Connection should come up immediately as a consequence of
    %% starting the watchdog process. In the accepting case this
    %% results in a new watchdog on a transport waiting for a new
    %% connection.

    {initial, okay} = ?WD_EVENT(Ref),
    ?EVENT({up, Ref, _, _, #diameter_packet{}}),

    %%   OKAY         Timer expires &      Failover()
    %%                Pending              SetWatchdog()        SUSPECT
    %%
    %% The peer replies to N DWR's before becoming silent, we should
    %% go down after N+2 watchdog_timer expirations: that is, after
    %% the first unanswered DWR. Knowing the min/max watchdog timeout
    %% values gives the time interval in which the event is expected.

    [0,0,0,0] = wd_counts(SvcName),

    {okay, suspect} = ?WD_EVENT(Ref),
    ?EVENT({down, Ref, _, _}),

    %% N received DWA's
    [_,_,_,N] = wd_counts(SvcName),

    %%   SUSPECT      Receive DWA          Pending = FALSE
    %%                                     Failback()
    %%                                     SetWatchdog()        OKAY
    %%
    %%   SUSPECT      Receive non-DWA      Failback()
    %%                                     SetWatchdog()        OKAY
    %%
    %% The peer sends a message before the expiry of another watchdog
    %% to induce failback.

    {suspect, okay} = ?WD_EVENT(Ref),
    ?EVENT({up, Ref, _, _}),

    %% N+1 sent DWR's, N/N+1 received DWA's
    R1 = N+1,
    A1 = choose(M == 'DWA', R1, N),
    [R1,_,_,A1] = wd_counts(SvcName),

    %%   OKAY         Timer expires &      SendWatchdog()
    %%                !Pending             SetWatchdog()
    %%                                     Pending = TRUE       OKAY
    %%
    %%   OKAY         Timer expires &      Failover()
    %%                Pending              SetWatchdog()        SUSPECT
    %%
    %% The peer is now ignoring all watchdogs so the connection goes
    %% back down after either one or two watchdog expiries, depending
    %% on whether or not DWA restored the connection.

    {okay, suspect} = ?WD_EVENT(Ref),
    ?EVENT({down, Ref, _, _}),

    %%   SUSPECT      Timer expires        CloseConnection()
    %%                                     SetWatchdog()        DOWN
    %%
    %% Non-response brings the connection down after another timeout.

    {suspect, down} = ?WD_EVENT(Ref),

    %% One more DWR has been sent only if DWA induced the failback.
    R2 = R1 + choose(M == 'DWA', 1, 0),
    A2 = A1,
    [R2,_,_,A2] = wd_counts(SvcName),

    %%   DOWN         Timer expires        AttemptOpen()
    %%                                     SetWatchdog()        DOWN
    %%
    %%   DOWN         Connection up        NumDWA = 0
    %%                                     SendWatchdog()
    %%                                     SetWatchdog()
    %%                                     Pending = TRUE       REOPEN
    %%
    %% The connection is reestablished after another timeout.

    recv_reopen(Type, Ref),

    %%   REOPEN       Receive non-DWA      Throwaway()          REOPEN
    %%
    %%   REOPEN       Receive DWA &        Pending = FALSE
    %%                NumDWA < 2           NumDWA++             REOPEN
    %%
    %%   REOPEN       Receive DWA &        Pending = FALSE
    %%                NumDWA == 2          NumDWA++
    %%                                     Failback()           OKAY
    %%
    %%   REOPEN       Timer expires &      SendWatchdog()
    %%                !Pending             SetWatchdog()
    %%                                     Pending = TRUE       REOPEN
    %%
    %% An exchange of 3 watchdogs (the first directly after
    %% capabilities exchange) brings the connection back up.

    {reopen, okay} = ?WD_EVENT(Ref),
    ?EVENT({up, Ref, _, _, #diameter_packet{}}),

    %% Three DWR's have been answered.
    R3 = R2 + 3,
    A3 = A2 + 3,
    [R3,_,_,A3] = wd_counts(SvcName),

    %% Non-response brings it down again.

    {okay, suspect} = ?WD_EVENT(Ref),
    ?EVENT({down, Ref, _, _}),
    {suspect, down} = ?WD_EVENT(Ref),

    %% One further, unanswered DWR.
    R4 = R3 + 1,
    A4 = A3,
    [R4,_,_,A4] = wd_counts(SvcName),

    %% Reestablish after another watchdog.

    recv_reopen(Type, Ref),

    %%   REOPEN       Timer expires &      NumDWA = -1
    %%                Pending &            SetWatchdog()
    %%                NumDWA >= 0                               REOPEN
    %%
    %%   REOPEN       Timer expires &      CloseConnection()
    %%                Pending &            SetWatchdog()
    %%                NumDWA < 0                                DOWN
    %%
    %% Peer is now ignoring all watchdogs: go down again after 2
    %% timeouts.

    {reopen, down} = ?WD_EVENT(Ref);

%% The misbehaving peer.
reopen(Type, _, SvcName, Ref, Wd, N, M) ->
    ?LOG("peer ~p", [[Type, SvcName, Ref, Wd, N, M]]),

    %% First transport process.
    {initial, okay} = ?WD_EVENT(Ref),
    ?EVENT({up, Ref, _, _, #diameter_packet{}}),

    %% Publish the behaviour for send/2 to pick up on the first
    %% outgoing DWA.
    reg(Ref, SvcName, {SvcName, {Wd,N,M}}),

    {okay, down} = ?WD_EVENT(Ref),

    %% Second transport process.
    ?EVENT({watchdog, Ref, _, {_, okay}, _}),
    reg(Ref, SvcName, 3),  %% answer 3 watchdogs then fall silent
    ?EVENT({watchdog, Ref, _, {_, down}, _}),

    %% Third transport process.
    ?EVENT({watchdog, Ref, _, {_, okay}, _}),
    reg(Ref, SvcName, 0),  %% disable outgoing DWA
    ?EVENT({watchdog, Ref, _, {_, down}, _}),

    ok.

%% log_wd/1
%%
%% Log a watchdog state transition and return it unchanged.

log_wd({From, To} = T) ->
    ?LOG("~p -> ~p", [From, To]),
    T.

%% log_event/1
%%
%% Log the tag of a non-watchdog event and return the event unchanged.
%% Watchdog events are not logged here since log_wd/1 logs them.

log_event(E) ->
    T = element(1,E),
    T == watchdog orelse ?LOG("~p", [T]),
    E.

%% recv_reopen/2
%%
%% Await the transition into REOPEN. A connecting transport is
%% required to come from DOWN and to be followed by a reconnect event;
%% a listening transport may come from any state.

recv_reopen(connect, Ref) ->
    {down, reopen} = ?WD_EVENT(Ref),
    ?EVENT({reconnect, Ref, _});

recv_reopen(listen, Ref) ->
    {_, reopen} = ?WD_EVENT(Ref).

%% reg/3
%%
%% Lookup the pid of the transport process and publish a term for
%% message/3 to lookup.
reg(TRef, SvcName, T) ->
    TPid = tpid(TRef, diameter:service_info(SvcName, transport)),
    true = diameter_reg:add_new({?MODULE, TPid, T}).

%% tpid/2
%%
%% Extract the transport process pid (the 'owner' of the port) from
%% transport service info. The match is on the exact leading keys of
%% the info list, so a change in its shape fails the testcase.

%% Connecting transport: the port info is at the top level.
tpid(Ref, [[{ref, Ref},
            {type, connect},
            {options, _},
            {watchdog, _},
            {peer, _},
            {apps, _},
            {caps, _},
            {port, [{owner, TPid} | _]}
            | _]]) ->
    TPid;

%% Listening transport: require exactly one accepted connection whose
%% watchdog is in state okay or reopen.
tpid(Ref, [[{ref, Ref},
            {type, listen},
            {options, _},
            {accept, As}
            | _]]) ->
    [[{watchdog, _},
      {peer, _},
      {apps, _},
      {caps, _},
      {port, [{owner, TPid} | _]}
      | _]]
        = lists:filter(fun([{watchdog, {_,_,S}} | _]) ->
                               S == okay orelse S == reopen
                       end,
                       As),
    TPid.

%% ===========================================================================
%% # suspect/1
%% ===========================================================================

%% Configure transports to require a set number of watchdog timeouts
%% before moving from OKAY to SUSPECT.

%% suspect/1
%%
%% Testcase: run suspect/4, by way of abuse/1, for each number of
%% timeouts N.

suspect(_) ->
    [] = run([[abuse, [suspect, N]] || N <- [0,1,3]]).
%% suspect/4
%%
%% Called with (Type, Fake, Ref, N) to start a service whose transport
%% has {suspect, N} as watchdog_config, then recursively with
%% (TRef, Fake, SvcName, N) once the initial watchdog event has been
%% received. Fake = true is the peer that stops answering DWR,
%% Fake = false is the watchdog being examined.

suspect(Type, Fake, Ref, N)
  when is_reference(Ref) ->
    WdOpts = {?WD(10000), [{suspect, N}], Fake},
    {SvcName, TRef} = start(Type, Ref, WdOpts),
    {initial, okay} = ?WD_EVENT(TRef),
    suspect(TRef, Fake, SvcName, N);

%% The faked peer: fall silent and wait to leave OKAY.
suspect(TRef, true, SvcName, _N) ->
    reg(TRef, SvcName, 0),  %% disable outgoing DWA
    {okay, _} = ?WD_EVENT(TRef);

%% SUSPECT disabled: there should be no transition at all during 2+
%% watchdogs, and only a single DWR sent.
suspect(TRef, false, SvcName, 0) ->
    Quiet = 28000,
    false = ?WD_EVENT(TRef, Quiet),
    [1,0,0,0] = wd_counts(SvcName);

%% SUSPECT after N timeouts: one DWR sent, nothing received, at every
%% step.
suspect(TRef, false, SvcName, N) ->
    OneDWR = [1,0,0,0],
    %% No watchdog transition within N+ watchdogs ...
    Quiet = N*10000+8000,
    false = ?WD_EVENT(TRef, Quiet),
    OneDWR = wd_counts(SvcName),
    %% ... then the connection becomes suspect ...
    {okay, suspect} = ?WD_EVENT(TRef, 10000),
    OneDWR = wd_counts(SvcName),
    %% ... and finally goes down.
    {suspect, down} = ?WD_EVENT(TRef, 18000),
    OneDWR = wd_counts(SvcName).

%% abuse/1
%%
%% Run abuse/2 with each transport type in turn as the one under test.

abuse(F) ->
    Jobs = [[abuse, F, Test] || Test <- [listen, connect]],
    [] = run(Jobs).

%% abuse/2
%%
%% Given an argument list of at least three elements: log it and apply
%% the named testcase function. Given [F | ExtraArgs] and the transport
%% type under test: run F for both a listening and a connecting
%% transport sharing one reference, the second argument flagging the
%% transport that is *not* faked ... or rather, the one equal to Test.
%% A bare F is treated as [F].

abuse(F, [_,_,_|_] = Args) ->
    ?LOG(Args),
    apply(?MODULE, F, Args);

abuse([F|Extra], Test) ->
    Ref = make_ref(),
    Jobs = [[abuse, F, [Type, Type == Test, Ref | Extra]]
            || Type <- [listen, connect]],
    [] = run(Jobs);

abuse(F, Test) ->
    abuse([F], Test).

%% ===========================================================================
%% # okay/1
%% ===========================================================================

%% Configure the number of watchdog exchanges before moving from
%% REOPEN to OKAY.

%% okay/1
%%
%% Testcase: run okay/4, by way of abuse/1, for each number of
%% exchanges.

okay(_) ->
    Jobs = [[abuse, [okay, N]] || N <- [0,2,3]],
    [] = run(Jobs).
%% okay/4
%%
%% Start a service whose transport has {okay, N} as watchdog_config
%% (the faked peer always gets {okay, 0}), await the initial watchdog
%% event and continue in okay/5. The state from which a reestablished
%% connection is expected to leave is 'initial' for a listening
%% transport and 'down' for a connecting one.

okay(Type, Fake, Ref, N)
  when is_reference(Ref) ->
    Exchanges = choose(Fake, 0, N),
    WdOpts = {?WD(10000), [{okay, Exchanges}], Fake},
    {SvcName, TRef} = start(Type, Ref, WdOpts),
    {initial, okay} = ?WD_EVENT(TRef),
    From = case Type of
               listen -> initial;
               _ -> down
           end,
    okay(TRef, Fake, SvcName, From, N).

%% okay/5

%% The faked peer: go silent to take the connection down, let it come
%% back up, then answer watchdogs again until it goes down once more.
okay(TRef, true, SvcName, From, _N) ->
    reg(TRef, SvcName, 0),  %% disable outgoing DWA
    {okay, down} = ?WD_EVENT(TRef),
    {From, okay} = ?WD_EVENT(TRef),
    reg(TRef, SvcName, -1), %% enable outgoing DWA
    {okay, down} = ?WD_EVENT(TRef);

%% The watchdog being examined: lose the connection after a single
%% unanswered DWR, then check how it is reestablished.
okay(TRef, false, SvcName, From, N) ->
    {okay, suspect} = ?WD_EVENT(TRef),
    [1,0,0,0] = wd_counts(SvcName),
    {suspect, down} = ?WD_EVENT(TRef),
    ok(TRef, SvcName, From, N).

%% ok/4

%% No watchdog exchange required: straight to OKAY.
ok(TRef, SvcName, From, 0) ->
    {From, okay} = ?WD_EVENT(TRef),
    [1,0,0,0] = wd_counts(SvcName),
    %% The connection should stay up for 2+ watchdog timeouts, during
    %% which two watchdogs are exchanged.
    StayUp = 28000,
    false = ?WD_EVENT(TRef, StayUp),
    [3,0,0,2] = wd_counts(SvcName);

%% N exchanges required: by way of REOPEN.
ok(TRef, SvcName, From, N) ->
    {From, reopen} = ?WD_EVENT(TRef),
    {reopen, okay} = ?WD_EVENT(TRef),
    %% One DWR was sent in moving to suspect, plus N more, all
    %% answered, to reopen the connection.
    Expected = [N+1,0,0,N],
    Expected = wd_counts(SvcName).

%% ===========================================================================

%% wd_counts/1
%%
%% Watchdog message counters of the service's only transport, as
%% [SentDWR, SentDWA, RecvDWR, RecvDWA]. A counter missing from the
%% statistics counts as 0.

wd_counts(SvcName) ->
    [Info] = diameter:service_info(SvcName, transport),
    {statistics, Counters} = lists:keyfind(statistics, 1, Info),
    %% {ApplicationId, CommandCode = 280, RequestFlag} per direction.
    Keys = [{{0,280,1}, send},
            {{0,280,0}, send},
            {{0,280,1}, recv},
            {{0,280,0}, recv}],
    [proplists:get_value(Key, Counters, 0) || Key <- Keys].
%% start/3
%%
%% Start a uniquely named service, subscribe the calling process to
%% its events, add a single transport of the given Type and publish
%% {Type, Ref, Name} so that the connecting side can find the
%% listening service (see cfg/2). T is the {Timer, Config, Fake}
%% triple interpreted by opts/3.
%%
%% Returns {Name, TRef}: the service name and transport reference.

start(Type, Ref, T) ->
    Name = hostname(),
    true = diameter:subscribe(Name),
    ok = diameter:start_service(Name, [{monitor, self()} | ?SERVICE(Name)]),
    {ok, TRef} = diameter:add_transport(Name, {Type, opts(Type, Ref, T)}),
    true = diameter_reg:add_new({Type, Ref, Name}),
    {Name, TRef}.

%% opts/3
%%
%% Transport options for diameter:add_transport/2. Timer is the
%% watchdog_timer, Config the watchdog_config, and Fake decides
%% whether or not message/3 is installed as message callback.

opts(Type, Ref, {Timer, Config, Fake})
  when is_boolean(Fake) ->
    [{transport_module, diameter_tcp},
     {transport_config, mod(Fake) ++ [{ip, ?ADDR}, {port, 0}]
                                  ++ cfg(Type, Ref)},
     {watchdog_timer, Timer},
     {watchdog_config, Config}].

%% mod/1
%%
%% The message_cb option, with initial callback state capx, if and
%% only if B is true.

mod(B) ->
    [{message_cb, [fun message/3, capx]} || B].

%% cfg/2
%%
%% Type-specific transport config. A connecting transport waits for
%% the listening service to be published by start/3 and connects to
%% the port it is listening on.

cfg(listen, _) ->
    [];
cfg(connect, Ref) ->
    [{{_, _, SvcName}, _Pid}] = diameter_reg:wait({listen, Ref, '_'}),
    [[{ref, LRef} | _]] = diameter:service_info(SvcName, transport),
    [LP] = ?util:lport(tcp, LRef),
    [{raddr, ?ADDR}, {rport, LP}].

%% ===========================================================================

%% message/3
%%
%% diameter_tcp message_cb of the faked peer. Outgoing messages are
%% handled by send/2, with the third argument as callback state;
%% incoming messages are passed on unchanged; anything else results in
%% no message.

message(send, Bin, X) ->
    send(Bin, X);

message(recv, Bin, _) ->
    [Bin];

message(_, _, _) ->
    [].

%% send/2
%%
%% Decide the fate of an outgoing message given the callback state.
%% The return value is the list returned from message/3: clauses that
%% change state return a new callback ('fun message/3' followed by the
%% new state) after the message, if any, to pass on.
%%
%% Callback states:
%%
%%   capx                  - nothing sent yet
%%   init                  - capabilities sent, behaviour not yet known
%%   {SvcName, {Wd,N,Msg}} - as published by reopen/7 for the first
%%                           transport process
%%   {Wd, N, Msg}          - answer N more DWR's, then induce failback
%%                           with Msg
%%   N                     - answer N more DWR's (0 = none, negative =
%%                           never stop)

%% First outgoing message from a new transport process is CER/CEA.
%% Remaining outgoing messages are either DWR or DWA.
send(Bin, capx) ->
    <<_:32, _:8, 257:24, _/binary>> = Bin, %% assert on CER/CEA
    [Bin, fun message/3, init];

%% Outgoing DWR: fake reception of DWA. Use the fact that AVP values
%% are ignored. This is to ensure that the peer's watchdog state
%% transitions are only induced by responses to messages it sends.
%%
%% The Hop-by-Hop Identifier precedes the End-to-End Identifier in the
%% Diameter header (RFC 6733, section 3), so bind them in that order
%% for the faked answer to echo each identifier in its own field.
send(<<_:32, 1:1, _:7, 280:24, _:32, HId:32, EId:32, _/binary>>, _) ->
    Pkt = #diameter_packet{header = #diameter_header{version = 1,
                                                     end_to_end_id = EId,
                                                     hop_by_hop_id = HId},
                           msg = ['DWA', {'Result-Code', 2001},
                                         {'Origin-Host', "XXX"},
                                         {'Origin-Realm', ?REALM}]},
    #diameter_packet{bin = Bin} = diameter_codec:encode(?BASE, Pkt),
    [recv, Bin];

%% First outgoing DWA: block until the testcase has published the
%% behaviour of this transport process with reg/3.
send(Bin, init) ->
    [{{?MODULE, _, T}, _}] = diameter_reg:wait({?MODULE, self(), '_'}),
    send(Bin, T);

%% First transport process: remember our Origin-Host/Realm AVPs in the
%% process dictionary for use in a failback message.
send(Bin, {SvcName, {_,_,_} = T}) ->
    [{'Origin-Host', _} = OH, {'Origin-Realm', _} = OR | _]
        = ?SERVICE(SvcName),
    putr(origin, [OH, OR]),
    send(Bin, T);

%% Discard DWA, failback after another timeout in the peer: the
%% failback message is deferred by ?ONE_WD(Wd) milliseconds.
send(Bin, {Wd, 0 = No, Msg}) ->
    Origin = getr(origin),
    [{defer, ?ONE_WD(Wd), [msg(Msg, Bin, Origin)]}, fun message/3, No];

%% Send DWA while we're in the mood (aka 0 < N).
send(Bin, {Wd, N, Msg}) ->
    [Bin, fun message/3, {Wd, N-1, Msg}];

%% Discard DWA.
send(_Bin, 0 = No) ->
    [fun message/3, No];

%% Send DWA.
send(<<_:32, 0:1, _:7, 280:24, _/binary>> = DWA, N) ->
    [DWA, fun message/3, N-1].

%% msg/3
%%
%% The encoded message with which to induce failback: the discarded
%% DWA itself, or a newly encoded DWR or RAA.

msg('DWA', Bin, _Origin) ->
    Bin;
msg(Msg, _Bin, Origin) ->
    #diameter_packet{bin = Bin}
        = diameter_codec:encode(?BASE, msg(Msg, Origin)),
    Bin.

%% msg/2
%%
%% A message to encode, T being the Origin-Host/Realm AVPs.

msg('DWR' = M, T) ->
    [M | T];

%% An unexpected answer is discarded after passing through the
%% watchdog state machine.
msg('RAA', T) ->
    ['RAA', {'Session-Id', diameter:session_id("abc")},
            {'Result-Code', 2001}
          | T].

%% ===========================================================================

%% peer_up/3
%%
%% diameter_app callback: leave the state unchanged.

peer_up(_SvcName, _Peer, S) ->
    S.

%% peer_down/3
%%
%% diameter_app callback: leave the state unchanged.

peer_down(_SvcName, _Peer, S) ->
    S.

%% ===========================================================================

%% choose/3
%%
%% The second argument if the first is true, the third if it's false.

choose(true, X, _)  -> X;
choose(false, _, X) -> X.
%% id/1
%%
%% Jitter callback: the identity function, so that a watchdog timer
%% configured as ?WD(T) is exactly T.

id(Timer) ->
    Timer.

%% run/1
%%
%% A more useful badmatch in case of failure: each element of Fs, a
%% function name followed by its arguments, is evaluated through
%% run1/1.

run(Fs) ->
    Jobs = [{?MODULE, [run1, Job]} || Job <- Fs],
    ?util:run(Jobs).

%% run1/1
%%
%% Apply an exported function of this module. An exception is logged
%% together with the arguments, and its stacktrace is what then fails
%% the match on ok.

run1([Fun | Args]) ->
    Outcome = try apply(?MODULE, Fun, Args) of
                  _ ->
                      ok
              catch
                  Class:Reason:Trace ->
                      ?WARN("~p", [{Args, Class, Reason, Trace}]),
                      Trace
              end,
    ok = Outcome.

%% jitter/2
%%
%% The largest timeout resulting from a watchdog timer: a callback
%% timer is taken as is, anything else has Delta added to it.

jitter(Timer, Delta) ->
    case Timer of
        ?WD(Exact) ->
            Exact;
        Base ->
            Base + Delta
    end.

%% hostname/0
%%
%% Generate a unique hostname for the faked peer.

hostname() ->
    ?util:unique_string().

%% putr/2
%%
%% Store a value in the process dictionary under a module-specific
%% key. Returns what put/2 returns.

putr(Key, Val) ->
    erlang:put({?MODULE, Key}, Val).

%% getr/1
%%
%% Retrieve a value stored with putr/2, or undefined if there is none.

getr(Key) ->
    erlang:get({?MODULE, Key}).