1%%
2%% %CopyrightBegin%
3%%
4%% Copyright Ericsson AB 2019-2021. All Rights Reserved.
5%%
6%% Licensed under the Apache License, Version 2.0 (the "License");
7%% you may not use this file except in compliance with the License.
8%% You may obtain a copy of the License at
9%%
10%%     http://www.apache.org/licenses/LICENSE-2.0
11%%
12%% Unless required by applicable law or agreed to in writing, software
13%% distributed under the License is distributed on an "AS IS" BASIS,
14%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15%% See the License for the specific language governing permissions and
16%% limitations under the License.
17%%
18%% %CopyrightEnd%
19%%
20
21-module(snmp_test_global_sys_monitor).
22
23-export([start/0, stop/0,
24         reset_events/0,
25         events/0,
26         log/1]).
27-export([init/1]).
28
29-include("snmp_test_lib.hrl").
30
31-define(NAME,    ?MODULE).
32-define(TIMEOUT, timer:seconds(6)).
33
34
35%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
36
%% Start the global system monitor process.
%% The process is spawned through proc_lib:start/3, so this call blocks
%% until init/1 acknowledges with proc_lib:init_ack/2. The acknowledged
%% value is returned: {ok, Pid} on success or {error, already_started}
%% when the global name is already taken.
start() ->
    proc_lib:start(?MODULE, init, [self()]).
40
%% Ask the global system monitor to terminate.
%% This is asynchronous: the request is only sent (see cast/1), no reply
%% is awaited.
stop() ->
    Request = stop,
    cast(Request).
43
%% Reset the event "collector" (the list of events gathered since the
%% last reset). The total event counter kept by the monitor is *not*
%% reset. See events/0 for how to read the collected events.
%% The request is synchronous and bounded by ?TIMEOUT.
reset_events() ->
    Request = reset_events,
    call(Request, ?TIMEOUT).
48
%% Request the events collected by the global system monitor since the
%% last reset. The request is synchronous and bounded by ?TIMEOUT.
events() ->
    Request = events,
    call(Request, ?TIMEOUT).
51
%% Report an event to the global system monitor.
%% The event is tagged with the name of the calling node and sent
%% asynchronously (see cast/1).
log(Event) ->
    Origin = node(),
    cast({Origin, Event}).
54
55
56%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
57
%% Entry point of the monitor process (spawned by start/0).
%% Raises its own priority, tries to claim the global name ?NAME and
%% then either enters the server loop or reports failure to the parent.
init(Parent) ->
    process_flag(priority, high),
    Registration = global:register_name(?NAME, self()),
    init_registered(Registration, Parent).

%% Continue the start-up depending on the outcome of the global
%% name registration.
init_registered(yes, Parent) ->
    info_msg("Starting as ~p (on ~p)", [self(), node()]),
    proc_lib:init_ack(Parent, {ok, self()}),
    InitialState = #{parent => Parent, ev_cnt => 0, evs => []},
    loop(InitialState);
init_registered(no, Parent) ->
    %% Someone else already owns the global name; tell the parent and
    %% terminate quietly.
    warning_msg("Already started", []),
    proc_lib:init_ack(Parent, {error, already_started}),
    exit(normal).
70
71
72%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
73
%% Server loop: take the next message (whatever it is), let
%% handle_msg/2 act on it and continue with the resulting state.
%% State is a map with the keys parent, ev_cnt (total number of counted
%% events) and evs (events collected since the last reset, newest first).
loop(State) ->
    receive
        Msg ->
            loop(handle_msg(Msg, State))
    end.

%% Handle one message and return the next state.
%% A stop request never returns; it terminates the process.
handle_msg({?MODULE, stop}, State) ->
    warning_msg("Stopping with ~w events counted",
                [maps:get(ev_cnt, State)]),
    exit(normal);
handle_msg({?MODULE, Ref, From, reset_events}, State) ->
    Total   = maps:get(ev_cnt, State),
    Current = length(maps:get(evs, State)),
    info_msg("Reset events when"
             "~n   Total Number of Events:   ~p"
             "~n   Current Number of Events: ~p",
             [Total, Current]),
    From ! {?MODULE, Ref, {ok, {Total, Current}}},
    %% Only the collector is cleared; the total counter is kept.
    State#{evs => []};
handle_msg({?MODULE, Ref, From, events}, State) ->
    %% Events are stored newest first; reply with oldest first.
    Collected = lists:reverse(maps:get(evs, State)),
    From ! {?MODULE, Ref, Collected},
    State;
handle_msg({?MODULE, {Node, Event}}, State) ->
    process_event(State, Node, Event);
handle_msg({nodedown = Event, Node}, State) ->
    process_event(State, Node, Event);
handle_msg(_Unexpected, State) ->
    %% Anything else is silently dropped.
    State.
107
108
109%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
110
%% Process one event reported for Node and return the (possibly
%% updated) state. Only system monitor events (4-tuples) can change the
%% state; everything else is merely logged.
process_event(State, Node, {Pid, TS, Tag, Info}) ->
    process_system_event(State, Node, Pid, TS, Tag, Info);

process_event(State, Node, {TS, starting}) ->
    info_msg("System Monitor starting on node ~p at ~s",
             [Node, snmp_misc:format_timestamp(TS)]),
    set_node_monitoring(Node, true),
    State;

process_event(State, Node, {TS, stopping}) ->
    %% NOTE(review): this clause formats the timestamp with ?FTS/1 while
    %% the sibling clauses call snmp_misc:format_timestamp/1 directly -
    %% presumably equivalent; confirm against snmp_test_lib.hrl.
    info_msg("System Monitor stopping on node ~p at ~s",
             [Node, ?FTS(TS)]),
    set_node_monitoring(Node, false),
    State;

process_event(State, Node, {TS, already_started}) ->
    info_msg("System Monitor already started on node ~p at ~s",
             [Node, snmp_misc:format_timestamp(TS)]),
    State;

process_event(State, Node, nodedown) ->
    info_msg("Node ~p down", [Node]),
    State;

process_event(State, Node, Event) ->
    warning_msg("Received unknown event from node ~p:"
                "~n   ~p", [Node, Event]),
    State.

%% Turn node monitoring on or off for a remote node.
%% Nothing is done for the local node.
set_node_monitoring(Node, _Flag) when Node =:= node() ->
    ok;
set_node_monitoring(Node, Flag) ->
    erlang:monitor_node(Node, Flag).
149
150
%% System Monitor events.
%% Known events are logged, counted (ev_cnt) and remembered in the
%% collector (evs) as {Node, Tag}. The number printed in parentheses is
%% the size of the collector *before* this event is added.
process_system_event(#{ev_cnt := Cnt, evs := Evs} = State,
                     Node, Pid, TS, Tag, Info)
  when Tag =:= long_gc;
       Tag =:= long_schedule;
       Tag =:= large_heap;
       Tag =:= busy_port;
       Tag =:= busy_dist_port ->
    Pre = f(system_event_format(Tag), [length(Evs)]),
    print_system_event(Pre, Node, Pid, TS, Info),
    State#{ev_cnt => Cnt + 1, evs => [{Node, Tag} | Evs]};

%% And everything else: logged, but neither counted nor collected.
process_system_event(State, Node, Pid, TS, Tag, Info) ->
    print_system_event(f("Unknown Event '~p'", [Tag]), Node, Pid, TS, Info),
    State.

%% Format string of the log message prefix for each known event tag.
system_event_format(long_gc)        -> "Long GC (~w)";
system_event_format(long_schedule)  -> "Long Schedule (~w)";
system_event_format(large_heap)     -> "Large Heap (~w)";
system_event_format(busy_port)      -> "Busy port (~w)";
system_event_format(busy_dist_port) -> "Busy dist port (~w)".
180
181
%% Log a system monitor event as a warning.
%% Pre is the already formatted message prefix, TS the timestamp of the
%% event and Info the event details (printed with ~p).
print_system_event(Pre, Node, Pid, TS, Info) ->
    Args = [Pre, Node, Pid, snmp_misc:format_timestamp(TS), Info],
    warning_msg("~s from ~p (~p) at ~s:"
                "~n   ~p", Args).
186
%% Format F with the arguments A (as io_lib:format/2) and return the
%% result as a flat string.
f(F, A) ->
    Chars = io_lib:format(F, A),
    lists:flatten(Chars).
189
190
191%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
192
%% Send Msg asynchronously to the globally registered monitor.
%% Returns ok when the message was sent. Any exception raised by
%% global:send/2 is converted into {error, {catched, Class, Reason}}.
cast(Msg) ->
    Envelope = {?MODULE, Msg},
    try global:send(?NAME, Envelope) of
        Pid when is_pid(Pid) ->
            ok
    catch
        Class:Reason ->
            {error, {catched, Class, Reason}}
    end.
201
202%% call(Req) ->
203%%     call(Req, infinity).
204
205%% call(Req, Timeout) ->
206%%     Ref = make_ref(),
207%%     try global:send(?NAME, {?MODULE, Ref, self(), Req}) of
208%%         Pid when is_pid(Pid) ->
209%%             receive
210%%                 {?MODULE, Ref, Rep} ->
211%%                     Rep
212%%             after Timeout ->
213%%                     {error, timeout}
214%%             end
215%%     catch
216%%         C:E:_ ->
217%%             {error, {catched, C, E}}
218%%     end.
219
%% Synchronous request to the monitor with an overall timeout.
%% The helper process used by call/3 gets a somewhat shorter timeout
%% than the caller, so that it normally reports its own timeout before
%% the caller gives up on it.
call(Req, Timeout) when (Timeout =:= infinity) orelse is_integer(Timeout) ->
    call(Req, Timeout, helper_timeout(Timeout)).

%% Timeout used inside the helper process, derived from the outer one.
helper_timeout(infinity) ->
    infinity;
helper_timeout(Timeout) when Timeout > 2000 ->
    Timeout - 1000;
helper_timeout(Timeout) when Timeout > 1000 ->
    Timeout - 500;
helper_timeout(Timeout) ->
    Timeout div 2.
228
%% This piece of weirdness exists because on some machines this call has
%% hung (in a call during end_per_testcase, which had a 1 min timeout,
%% or if that was the total time for the test case).
%% But because it hung there, we don't really know where it got stuck.
%% So, by making the call in a temporary process that we monitor, we
%% keep control. Callers in this module also use a finite timeout
%% (?TIMEOUT) instead of infinity.
%%
%% Timeout1 is how long the caller waits for the helper process to
%% terminate; Timeout2 is how long the helper waits for the reply.
%%
%% Returns:
%%   Reply                       - what the monitor answered
%%   {error, timeout}            - no reply within Timeout2
%%   {error, {catched, C, E}}    - global:send/2 raised C:E
%%   {error, {timeout, PInfo}}   - the helper did not terminate within
%%                                 Timeout1 (PInfo is its process_info)
call(Req, Timeout1, Timeout2) ->
    %% The helper delivers its result as its *exit reason*, since that
    %% is what the 'DOWN' message carries. Merely returning the value
    %% would make the process exit with reason 'normal'.
    F = fun() ->
                Ref = make_ref(),
                try global:send(?NAME, {?MODULE, Ref, self(), Req}) of
                    NamePid when is_pid(NamePid) ->
                        receive
                            {?MODULE, Ref, Rep} ->
                                exit(Rep)
                        after Timeout2 ->
                                exit({error, timeout})
                        end
                catch
                    C:E:_ ->
                        exit({error, {catched, C, E}})
                end
        end,
    {Pid, Mon} = spawn_monitor(F),
    receive
        {'DOWN', Mon, process, Pid, Result} ->
            Result
    after Timeout1 ->
            PInfo = process_info(Pid),
            exit(Pid, kill),
            %% Drop the monitor and any 'DOWN' message already
            %% delivered, so nothing is left in the caller's mailbox.
            erlang:demonitor(Mon, [flush]),
            {error, {timeout, PInfo}}
    end.
261
262
263
264%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
265
%% Log an info message, formatted from F and A and wrapped in the
%% monitor banner (see format_msg/2).
%% The already formatted text is passed as *data* to a fixed "~ts"
%% format, not as the format string itself: the text may contain '~'
%% characters (e.g. from a term printed with ~p), which would otherwise
%% be interpreted as format directives.
info_msg(F, A) ->
    error_logger:info_msg("~ts", [format_msg(F, A)]).
268
%% Log a warning message, formatted from F and A and wrapped in the
%% monitor banner (see format_msg/2).
%% The already formatted text is passed as *data* to a fixed "~ts"
%% format, not as the format string itself: the text may contain '~'
%% characters (e.g. from a term printed with ~p), which would otherwise
%% be interpreted as format directives.
warning_msg(F, A) ->
    error_logger:warning_msg("~ts", [format_msg(F, A)]).
271
272
%% Build the complete log text: the monitor banner, followed by the
%% message formatted from F and A, followed by two newlines.
%% Returns a flat string.
format_msg(F, A) ->
    Banner  = "~n****** SNMP TEST GLOBAL SYSTEM MONITOR ******~n~n",
    Trailer = "~n~n",
    f(lists:append([Banner, F, Trailer]), A).
279
280