1%%
2%% %CopyrightBegin%
3%%
4%% Copyright Ericsson AB 2021-2021. All Rights Reserved.
5%%
6%% Licensed under the Apache License, Version 2.0 (the "License");
7%% you may not use this file except in compliance with the License.
8%% You may obtain a copy of the License at
9%%
10%%     http://www.apache.org/licenses/LICENSE-2.0
11%%
12%% Unless required by applicable law or agreed to in writing, software
13%% distributed under the License is distributed on an "AS IS" BASIS,
14%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15%% See the License for the specific language governing permissions and
16%% limitations under the License.
17%%
18%% %CopyrightEnd%
19%%
20
21-module(kernel_test_global_sys_monitor).
22
23-export([start/0, stop/0,
24         reset_events/0,
25         events/0, events/1,
26         log/1]).
27-export([timestamp/0, format_timestamp/1]).
28-export([init/1]).
29
30-include("kernel_test_lib.hrl").
31
32-define(NAME,    ?MODULE).
33-define(TIMEOUT, timer:seconds(6)).
34
35
36%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
37
38start() ->
39    Parent = self(),
40    proc_lib:start(?MODULE, init, [Parent]).
41
42stop() ->
43    cast(stop).
44
45%% This does not reset the global counter but the "collector"
46%% See events for more info.
47reset_events() ->
48    call(reset_events, ?TIMEOUT).
49
50events() ->
51    events(infinity).
52
53events(Timeout) when (Timeout =:= infinity) orelse
54                     (is_integer(Timeout) andalso (Timeout > 0)) ->
55    call(events, Timeout).
56
57log(Event) ->
58    cast({node(), Event}).
59
60
61%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
62
63timestamp() ->
64    erlang:timestamp().
65
66format_timestamp({_N1, _N2, N3} = TS) ->
67    {Date, Time}   = calendar:now_to_local_time(TS),
68    {YYYY,MM,DD}   = Date,
69    {Hour,Min,Sec} = Time,
70    FormatDate =
71        io_lib:format("~.4w-~.2.0w-~.2.0w ~.2.0w:~.2.0w:~.2.0w.~.3.0w",
72                      [YYYY, MM, DD, Hour, Min, Sec, N3 div 1000]),
73    lists:flatten(FormatDate).
74
75
76%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
77
78init(Parent) ->
79    process_flag(priority, high),
80    case global:register_name(?NAME, self()) of
81        yes ->
82            info_msg("Starting as ~p (on ~p)", [self(), node()]),
83            proc_lib:init_ack(Parent, {ok, self()}),
84            loop(#{parent => Parent, ev_cnt => 0, evs => []});
85        no ->
86            warning_msg("Already started", []),
87            proc_lib:init_ack(Parent, {error, already_started}),
88            exit(normal)
89    end.
90
91
92%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
93
94loop(State) ->
95    receive
96        {?MODULE, stop} ->
97            warning_msg("Stopping with ~w events counted",
98                        [maps:get(ev_cnt, State)]),
99            exit(normal);
100
101        {?MODULE, Ref, From, reset_events} ->
102            TotEvCnt = maps:get(ev_cnt, State),
103            EvCnt    = length(maps:get(evs, State)),
104            info_msg("Reset events when"
105                     "~n   Total Number of Events:   ~p"
106                     "~n   Current Number of Events: ~p",
107                     [TotEvCnt, EvCnt]),
108            From ! {?MODULE, Ref, {ok, {TotEvCnt, EvCnt}}},
109            loop(State#{evs => []});
110
111        {?MODULE, Ref, From, events} ->
112            Evs = maps:get(evs, State),
113            From ! {?MODULE, Ref, lists:reverse(Evs)},
114            loop(State);
115
116        {?MODULE, {Node, Event}} ->
117            State2 = process_event(State, Node, Event),
118            loop(State2);
119
120        {nodedown = Event, Node} ->
121            State2 = process_event(State, Node, Event),
122            loop(State2);
123
124        _ ->
125            loop(State)
126    end.
127
128
129%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
130
131process_event(State, Node, {Pid, TS, Tag, Info}) ->
132    process_system_event(State, Node, Pid, TS, Tag, Info);
133
134process_event(State, Node, {TS, starting}) ->
135    FTS = format_timestamp(TS),
136    info_msg("System Monitor starting on node ~p at ~s", [Node, FTS]),
137    if
138        (Node =/= node()) ->
139            erlang:monitor_node(Node, true);
140        true ->
141            ok
142    end,
143    State;
144
145process_event(State, Node, {TS, stopping}) ->
146    FTS = format_timestamp(TS),
147    info_msg("System Monitor stopping on node ~p at ~s", [Node, FTS]),
148    if
149        (Node =/= node()) ->
150            erlang:monitor_node(Node, false);
151        true ->
152            ok
153    end,
154    State;
155
156process_event(State, Node, {TS, already_started}) ->
157    FTS = format_timestamp(TS),
158    info_msg("System Monitor already started on node ~p at ~s", [Node, FTS]),
159    State;
160
161process_event(State, Node, nodedown) ->
162    info_msg("Node ~p down", [Node]),
163    State;
164
165process_event(State, Node, Event) ->
166    warning_msg("Received unknown event from node ~p:"
167                "~n   ~p", [Node, Event]),
168    State.
169
170
171%% System Monitor events
172%% We only *count* system events
173process_system_event(#{ev_cnt := Cnt, evs := Evs} = State,
174                     Node, Pid, TS, long_gc = Ev, Info) ->
175    print_system_event(f("Long GC (~w)", [length(Evs)]), Node, Pid, TS, Info),
176    State#{ev_cnt => Cnt + 1, evs => [{Node, Ev} | Evs]};
177process_system_event(#{ev_cnt := Cnt, evs := Evs} = State,
178                     Node, Pid, TS, long_schedule = Ev, Info) ->
179    print_system_event(f("Long Schedule (~w)", [length(Evs)]), Node, Pid, TS, Info),
180    State#{ev_cnt => Cnt + 1, evs => [{Node, Ev} | Evs]};
181process_system_event(#{ev_cnt := Cnt, evs := Evs} = State,
182                     Node, Pid, TS, large_heap = Ev, Info) ->
183    print_system_event(f("Large Heap (~w)", [length(Evs)]), Node, Pid, TS, Info),
184    State#{ev_cnt => Cnt + 1, evs => [{Node, Ev} | Evs]};
185process_system_event(#{ev_cnt := Cnt, evs := Evs} = State,
186                     Node, Pid, TS, busy_port = Ev, Info) ->
187    print_system_event(f("Busy port (~w)", [length(Evs)]), Node, Pid, TS, Info),
188    State#{ev_cnt => Cnt + 1, evs => [{Node, Ev} | Evs]};
189process_system_event(#{ev_cnt := Cnt, evs := Evs} = State,
190                     Node, Pid, TS, busy_dist_port = Ev, Info) ->
191    print_system_event(f("Busy dist port (~w)", [length(Evs)]),
192                       Node, Pid, TS, Info),
193    State#{ev_cnt => Cnt + 1, evs => [{Node, Ev} | Evs]};
194
195%% And everything else
196process_system_event(State, Node, Pid, TS, Tag, Info) ->
197    Pre = f("Unknown Event '~p'", [Tag]),
198    print_system_event(Pre, Node, Pid, TS, Info),
199    State.
200
201
202print_system_event(Pre, Node, Pid, TS, Info) ->
203    FTS = format_timestamp(TS),
204    warning_msg("~s from ~p (~p) at ~s:"
205                "~n   ~p", [Pre, Node, Pid, FTS, Info]).
206
207f(F, A) ->
208    lists:flatten(io_lib:format(F, A)).
209
210
211%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
212
213cast(Msg) ->
214    try global:send(?NAME, {?MODULE, Msg}) of
215        Pid when is_pid(Pid) ->
216            ok
217    catch
218        C:E:_ ->
219            {error, {catched, C, E}}
220    end.
221
222
223call(Req, Timeout) when (Timeout =:= infinity) ->
224    call(Req, Timeout, Timeout);
225call(Req, Timeout) when is_integer(Timeout) andalso (Timeout > 2000) ->
226    call(Req, Timeout, Timeout - 1000);
227call(Req, Timeout) when is_integer(Timeout) andalso (Timeout > 1000) ->
228    call(Req, Timeout, Timeout - 500);
229call(Req, Timeout) when is_integer(Timeout) ->
230    call(Req, Timeout, Timeout div 2).
231
232%% This peace of wierdness is because on some machines this call has
233%% hung (in a call during end_per_testcase, which had a 1 min timeout,
234%% or if that was the total time for the test case).
235%% But because it hung there, we don't really know what where it git stuck.
236%% So, by making the call in a tmp process, that we supervise, we can
237%% keep control. Also, we change the default timeout from infinity to an
238%% actual time (16 seconds).
239call(Req, Timeout1, Timeout2) ->
240    F = fun() ->
241                Ref = make_ref(),
242                try global:send(?NAME, {?MODULE, Ref, self(), Req}) of
243                    NamePid when is_pid(NamePid) ->
244                        receive
245                            {?MODULE, Ref, Rep} ->
246                                exit(Rep)
247                        after Timeout2 ->
248                                {error, timeout}
249                        end
250                catch
251                    C:E:_ ->
252                        {error, {catched, C, E}}
253                end
254        end,
255    {Pid, Mon} = spawn_monitor(F),
256    receive
257        {'DOWN', Mon, process, Pid, Result} ->
258            Result
259    after Timeout1 ->
260            PInfo = process_info(Pid),
261            exit(Pid, kill),
262            {error, {timeout, PInfo}}
263    end.
264
265
266
267%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
268
269info_msg(F, A) ->
270    error_logger:info_msg(format_msg(F, A), []).
271
272warning_msg(F, A) ->
273    error_logger:warning_msg(format_msg(F, A), []).
274
275
276format_msg(F, A) ->
277    f("~n" ++
278          "****** KERNEL TEST GLOBAL SYSTEM MONITOR ******~n~n" ++
279          F ++
280          "~n~n",
281      A).
282
283