1%%
2%% %CopyrightBegin%
3%%
4%% Copyright Ericsson AB 1996-2018. All Rights Reserved.
5%%
6%% Licensed under the Apache License, Version 2.0 (the "License");
7%% you may not use this file except in compliance with the License.
8%% You may obtain a copy of the License at
9%%
10%%     http://www.apache.org/licenses/LICENSE-2.0
11%%
12%% Unless required by applicable law or agreed to in writing, software
13%% distributed under the License is distributed on an "AS IS" BASIS,
14%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15%% See the License for the specific language governing permissions and
16%% limitations under the License.
17%%
18%% %CopyrightEnd%
19%%
20-module(disksup_SUITE).
21-include_lib("common_test/include/ct.hrl").
22
23%% Test server specific exports
24-export([all/0, suite/0]).
25-export([init_per_suite/1, end_per_suite/1]).
26-export([init_per_testcase/2, end_per_testcase/2]).
27
28%% Test cases
29-export([api/1, config/1, alarm/1]).
30-export([port/1]).
31-export([terminate/1, unavailable/1, restart/1]).
32-export([otp_5910/1]).
33-export([posix_only/1, parse_df_output_posix/1, parse_df_output_susv3/1]).
34
%% Suite-level setup: the os_mon application must start cleanly before
%% any test case runs. The Common Test configuration is passed through
%% untouched.
init_per_suite(SuiteConfig) when is_list(SuiteConfig) ->
    ok = application:start(os_mon),
    SuiteConfig.
38
%% Suite-level teardown: stop the os_mon application started by
%% init_per_suite/1 and return the configuration unchanged.
end_per_suite(SuiteConfig) when is_list(SuiteConfig) ->
    ok = application:stop(os_mon),
    SuiteConfig.
42
%% Per-testcase setup. The 'unavailable' case needs disksup to be shut
%% down (and disabled) beforehand; every other case runs against the
%% service as it is. The configuration is always returned unchanged.
init_per_testcase(unavailable, Config) ->
    _ = terminate(Config),
    Config;
init_per_testcase(_TestCase, Config) ->
    Config.
48
%% Per-testcase teardown. Test cases that disabled or reconfigured
%% disksup ('unavailable' and 'posix_only') get the service restored
%% through restart/1; all other cases need no cleanup.
end_per_testcase(unavailable, Config) ->
    _ = restart(Config),
    ok;
end_per_testcase(posix_only, Config) ->
    _ = restart(Config),
    ok;
end_per_testcase(_TestCase, _Config) ->
    ok.
55
%% Suite-wide Common Test settings: install the ts_install_cth hook and
%% limit every test case to one minute.
suite() ->
    Hooks = {ct_hooks, [ts_install_cth]},
    Timetrap = {timetrap, {minutes, 1}},
    [Hooks, Timetrap].
59
%% List of test cases to run. On unix and win32 the regular cases are
%% run followed by the bug regression cases; on any other OS type only
%% the 'unavailable' case is run.
all() ->
    Regular = [api, config, alarm, port, posix_only, unavailable,
               parse_df_output_posix, parse_df_output_susv3],
    BugRegressions = [otp_5910],
    case test_server:os_type() of
        {Family, _Name} when Family =:= unix; Family =:= win32 ->
            Regular ++ BugRegressions;
        _Other ->
            [unavailable]
    end.
69
%% Test of API functions.
%% Exercises get_disk_data, the check interval getter/setter and the
%% almost-full threshold getter/setter. Each setter is tried with one
%% accepted and one rejected value; a rejected value must leave the
%% setting untouched. The settings are put back to 30 minutes / 0.80
%% at the end of each section.
api(Config) when is_list(Config) ->

    %% get_disk_data()
    ok = check_get_disk_data(),

    %% get_check_interval(): reported in milliseconds
    1800000 = disksup:get_check_interval(),

    %% set_check_interval(Minutes): 20 minutes is accepted ...
    ok = disksup:set_check_interval(20),
    1200000 = disksup:get_check_interval(),
    %% ... whereas a fractional number of minutes exits with badarg
    BadInterval = (catch disksup:set_check_interval(0.5)),
    {'EXIT',{badarg,_}} = BadInterval,
    1200000 = disksup:get_check_interval(),
    ok = disksup:set_check_interval(30),

    %% get_almost_full_threshold(): reported in percent
    80 = disksup:get_almost_full_threshold(),

    %% set_almost_full_threshold(Float): 0.90 is accepted ...
    ok = disksup:set_almost_full_threshold(0.90),
    90 = disksup:get_almost_full_threshold(),
    %% ... whereas a negative fraction exits with badarg
    BadThreshold = (catch disksup:set_almost_full_threshold(-0.5)),
    {'EXIT',{badarg,_}} = BadThreshold,
    90 = disksup:get_almost_full_threshold(),
    ok = disksup:set_almost_full_threshold(0.80),

    ok.
98
%% Test configuration.
%% Verifies that the os_mon application parameters
%% disk_space_check_interval and disk_almost_full_threshold are picked
%% up when disksup is restarted under os_mon_sup, and that bad values
%% are ignored (the test then expects 1800000 ms and 80 %).
config(Config) when is_list(Config) ->
    SetEnv =
        fun(Par, Val) ->
                ok = application:set_env(os_mon, Par, Val)
        end,
    RestartDisksup =
        fun() ->
                ok = supervisor:terminate_child(os_mon_sup, disksup),
                {ok, _Child} = supervisor:restart_child(os_mon_sup, disksup),
                ok
        end,

    %% Valid values: the change must be visible after a restart
    SetEnv(disk_space_check_interval, 29),
    SetEnv(disk_almost_full_threshold, 0.81),
    RestartDisksup(),
    1740000 = disksup:get_check_interval(),
    81 = disksup:get_almost_full_threshold(),

    %% Bad values: must be ignored after a restart
    SetEnv(disk_space_check_interval, 0.5),
    SetEnv(disk_almost_full_threshold, -0.81),
    RestartDisksup(),
    1800000 = disksup:get_check_interval(),
    80 = disksup:get_almost_full_threshold(),

    %% Put the configuration parameters back
    SetEnv(disk_space_check_interval, 30),
    SetEnv(disk_almost_full_threshold, 0.80),
    ok.
129
%%----------------------------------------------------------------------
%% NOTE: The test case is a bit weak as it will fail if the disk usage
%% changes too much during its course, or if there are timing problems
%% with the alarm_handler receiving the alarms too late
%%----------------------------------------------------------------------

%% Test that alarms are set and cleared.
%%
%% Three checks are made:
%%  1. with the current threshold, the number of disk_almost_full alarms
%%     equals the number of disks at or above the threshold;
%%  2. if some disk is below the threshold (and not empty), lowering the
%%     threshold just below its usage must raise additional alarms;
%%  3. if no disk is 100% full, raising the threshold above the highest
%%     usage must clear all alarms.
%%
%% The original threshold is restored in an 'after' section so that a
%% failing check does not leave a modified threshold behind for the test
%% cases that run afterwards.
alarm(Config) when is_list(Config) ->
    Threshold1 = disksup:get_almost_full_threshold(), % 80
    try
        %% Find out how many disks exceed the threshold
        %% and make sure the corresponding number of alarms is set
        Data1 = disksup:get_disk_data(),
        Over1 = over_threshold(Data1, Threshold1),
        Alarms1 = get_alarms(),
        case length(Alarms1) of
            Over1 ->
                ok;
            _ ->
                dump_info(),
                ct:fail({bad_alarms, Threshold1, Data1, Alarms1})
        end,

        %% Try to find a disk with space usage below Threshold1,
        %% lower the threshold accordingly and make sure new alarms are set
        IsBelowThreshold =
            fun({_Id, _Kbyte, Capacity}) ->
                    Capacity > 0 andalso Capacity < Threshold1
            end,
        case until(IsBelowThreshold, Data1) of
            {_, _, Cap1} ->
                Threshold2 = Cap1 - 1,
                ok = disksup:set_almost_full_threshold(Threshold2/100),
                disksup ! timeout, % force a disk check
                Data2 = disksup:get_disk_data(),
                Over2 = over_threshold(Data2, Threshold2),
                Alarms2 = get_alarms(),
                if
                    Over2 == length(Alarms2), Over2 > Over1 ->
                        ok;
                    true ->
                        dump_info(),
                        ct:fail({bad_alarms, Threshold2, Data2, Alarms2})
                end;
            false ->
                ignore
        end,

        %% Find out the highest space usage among all disks
        %% and try to raise the threshold above this value,
        %% make sure all alarms are cleared
        case lists:max([0 | [Cap || {_Id, _Kbyte, Cap} <- Data1]]) of
            Max when Max < 100 ->
                Threshold3 = Max + 1,
                ok = disksup:set_almost_full_threshold(Threshold3/100),
                disksup ! timeout, % force a disk check
                Data3 = disksup:get_disk_data(),
                Over3 = over_threshold(Data3, Threshold3),
                Alarms3 = get_alarms(),
                case {Over3, Alarms3} of
                    {0, []} ->
                        ok;
                    _ ->
                        dump_info(),
                        ct:fail({bad_alarms, Threshold3, Data3, Alarms3})
                end;
            100 ->
                %% A completely full disk: the threshold cannot be
                %% raised above its usage, so this check is skipped
                ignore
        end,
        ok
    after
        %% Reset threshold, whether or not the checks above succeeded
        ok = disksup:set_almost_full_threshold(Threshold1/100)
    end.
212
%% Count the disks whose capacity value is at or above Threshold.
%% Entries are sorted on disk id and de-duplicated first, so a disk that
%% is listed more than once is only counted once.
over_threshold(Data, Threshold) ->
    Unique = remove_duplicated_disks(lists:keysort(1, Data)),
    length([Cap || {_Id, _Kbyte, Cap} <- Unique, Cap >= Threshold]).
219
%% Collapse runs of adjacent entries that share the same disk id into a
%% single entry (the last one of the run is kept). The input is expected
%% to be sorted on disk id, so that duplicates are adjacent.
%%
%% Background: on some platforms (for example MontaVista) data for one
%% disk can be "duplicated":
%%  Linux ppb 2.4.20_mvl31-pcore680 #1 Sun Feb 1 23:12:56 PST 2004 ppc unknown
%%
%%  MontaVista(R) Linux(R) Professional Edition 3.1
%%
%%  [ppb:~]> /bin/df -lk
%%  Filesystem           1k-blocks      Used Available Use% Mounted on
%%  rootfs                 8066141   3023763   4961717  38% /
%%  /dev/root              8066141   3023763   4961717  38% /
%%  tmpfs                   192892         0    192892   0% /dev/shm
%%
%% disksup:
%%  [{"/",8066141,38}, {"/",8066141,38}, {"/dev/shm",192892,0}]
%%
%% disksup will only set ONE alarm for "/", so the expected number of
%% alarms must be calculated from the de-duplicated list, or the
%% testcase will fail erroneously.
remove_duplicated_disks(Disks) ->
    dedup_adjacent_disks(Disks, []).

%% Accumulator-based worker for remove_duplicated_disks/1.
dedup_adjacent_disks([{Id, _, _}, {Id, _, _} = Next | Rest], Acc) ->
    %% Same id as the following entry: drop this one, keep looking
    dedup_adjacent_disks([Next | Rest], Acc);
dedup_adjacent_disks([Disk | Rest], Acc) ->
    dedup_adjacent_disks(Rest, [Disk | Acc]);
dedup_adjacent_disks([], Acc) ->
    lists:reverse(Acc).
245
%% Return the currently set alarms whose alarm id has the form
%% {disk_almost_full, Disk}; all other alarms are left out.
get_alarms() ->
    [Alarm || {{disk_almost_full, _Disk}, _} = Alarm
                  <- alarm_handler:get_alarms()].
251
%% Return the first element of the list for which Pred returns true,
%% or false if there is no such element. Pred must return a boolean.
until(_Pred, []) ->
    false;
until(Pred, [Candidate | Rest]) ->
    case Pred(Candidate) of
        false -> until(Pred, Rest);
        true -> Candidate
    end.
258
%% Test that disksup handles a terminating port program.
%% The OS pid of the port program is taken from the second field of the
%% first 'ps -ef' line that mentions disksup. If no such line can be
%% parsed, or the program cannot be killed, the test case is skipped.
port(Config) when is_list(Config) ->
    PsOutput = os:cmd("ps -ef | grep '[d]isksup'"),
    case io_lib:fread("~s ~s", PsOutput) of
        {ok, [_User, OsPid], _Tail} ->
            port_kill_and_check(PsOutput, OsPid);
        _ ->
            {skip, {os_pid_not_found, PsOutput}}
    end.

%% Kill the port program with the given OS pid and check that disksup
%% exits with reason {port_died, _} within three seconds, and that it
%% delivers disk data again three seconds later.
port_kill_and_check(PsOutput, OsPid) ->
    %% Monitor disksup and make sure it is working before the kill
    Monitor = erlang:monitor(process, disksup),
    [{_, KbyteBefore, _} | _] = disksup:get_disk_data(),
    true = KbyteBefore > 0,

    %% Kill the port program; 'kill' prints nothing on success
    case os:cmd("kill -9 " ++ OsPid) of
        [] ->
            %% disksup should now terminate
            receive
                {'DOWN', Monitor, _, _, {port_died, _Why}} ->
                    ok;
                {'DOWN', Monitor, _, _, Other} ->
                    ct:fail({unexpected_exit_reason, Other})
            after 3000 ->
                    ct:fail({still_alive, PsOutput})
            end,

            %% Give os_mon_sup time to restart disksup
            ct:sleep({seconds,3}),
            [{_, KbyteAfter, _} | _] = disksup:get_disk_data(),
            true = KbyteAfter > 0,
            ok;
        KillOutput ->
            erlang:demonitor(Monitor),
            {skip, {not_killed, KillOutput}}
    end.
299
%% Disable disksup: set the os_mon parameter start_disksup to false and
%% terminate the disksup child of os_mon_sup. Returns ok.
terminate(Config) when is_list(Config) ->
    ok = application:set_env(os_mon, start_disksup, false),
    ok = supervisor:terminate_child(os_mon_sup, disksup).
304
%% Test correct behaviour when service is unavailable.
%% With disksup stopped (see init_per_testcase/2), every API function
%% is expected to answer with its dummy value instead of failing.
unavailable(Config) when is_list(Config) ->
    DummyData = disksup:get_disk_data(),
    [{"none",0,0}] = DummyData,

    DummyInterval = disksup:get_check_interval(),
    1800000 = DummyInterval,
    ok = disksup:set_check_interval(5),

    DummyThreshold = disksup:get_almost_full_threshold(),
    80 = DummyThreshold,
    ok = disksup:set_almost_full_threshold(0.9),
    ok.
315
%% Re-enable disksup: set start_disksup to true and disksup_posix_only
%% to false, then ask os_mon_sup to restart the disksup child. A child
%% that is already running is accepted as well.
restart(Config) when is_list(Config) ->
    EnvDefaults = [{start_disksup, true}, {disksup_posix_only, false}],
    lists:foreach(fun({Par, Val}) ->
                          ok = application:set_env(os_mon, Par, Val)
                  end, EnvDefaults),
    case supervisor:restart_child(os_mon_sup, disksup) of
        {error, running} -> ok;
        {ok, _ChildPid} -> ok
    end.
323
%% Test that alarms are cleared if disksup crashes or
%% if OS_Mon is stopped.
%%
%% Steps:
%%  1. make sure at least one disk_almost_full alarm is set, lowering
%%     the threshold if no disk is currently over it;
%%  2. kill disksup and check that, once it has been restarted, the
%%     number of alarms is unchanged (set once, not twice);
%%  3. stop os_mon and check that no disk_almost_full alarm remains;
%%  4. reset the threshold and start os_mon again.
otp_5910(Config) when is_list(Config) ->

    %% Make sure disksup sets at least one alarm
    Data = lists:sort(disksup:get_disk_data()),
    Threshold0 = disksup:get_almost_full_threshold(),
    Threshold =
        case over_threshold(Data, Threshold0) of
            0 ->
                %% No disk over the threshold: lower it to just below
                %% the usage of the first disk in the sorted list
                [{_Id, _Kbyte, Cap} | _] = Data,
                io:format("Data ~p Threshold ~p ~n", [Data, Cap-1]),
                ok = disksup:set_almost_full_threshold((Cap-1)/100),
                Cap-1;
            _N ->
                Threshold0
        end,
    %% Store the threshold in the application environment as well, so
    %% that it is also used by disksup after it has been restarted
    ok = application:set_env(os_mon, disk_almost_full_threshold, Threshold/100),
    disksup ! timeout, % force a disk check
    Data2 = disksup:get_disk_data(),
    Over = over_threshold(Data2, Threshold),
    Alarms = get_alarms(),
    if
        Over == 0 ->
            ct:fail({threshold_too_low, Data2, Threshold});
        Over == length(Alarms) ->
            ok;
        true ->
            dump_info(),
            ct:fail({bad_alarms, Threshold, Data2, Alarms})
    end,

    %% Kill disksup
    exit(whereis(disksup), faked_disksup_crash),

    %% Wait a little to make sure disksup has been restarted,
    %% then make sure the alarms are set once, but not twice
    ct:sleep({seconds,1}),
    Data3 = disksup:get_disk_data(),
    Alarms2 = get_alarms(),
    if
        length(Alarms2) == length(Alarms) ->
            ok;
        true ->
            dump_info(),
            ct:fail({bad_alarms, Threshold, Data3, Alarms, Alarms2})
    end,

    %% Stop OS_Mon and make sure all disksup alarms are cleared.
    %% The alarm list is fetched once, so that the list that is checked
    %% is also the list that is reported on failure.
    ok = application:stop(os_mon),
    ct:sleep({seconds,1}),
    case get_alarms() of
        [] -> ok;
        Alarms3 -> ct:fail({alarms_not_cleared, Alarms3})
    end,

    %% Reset threshold and restart OS_Mon. The parameter reset here is
    %% the same one that was set above (disk_almost_full_threshold).
    ok = application:set_env(os_mon, disk_almost_full_threshold, 0.8),
    ok = disksup:set_almost_full_threshold(0.8),
    ok = application:start(os_mon),
    ok.
383
%% Test disksup_posix_only option.
%% disksup is restarted with the option enabled and must still deliver
%% sane disk data. The option is switched off again by
%% end_per_testcase/2.
posix_only(Config) when is_list(Config) ->
    %% Enable the option; it takes effect when disksup is restarted
    ok = application:set_env(os_mon, disksup_posix_only, true),
    ok = supervisor:terminate_child(os_mon_sup, disksup),
    {ok, _NewChild} = supervisor:restart_child(os_mon_sup, disksup),

    ok = check_get_disk_data().
392
%% Print the sys status of the disksup process, as a debugging aid
%% before a test case is failed.
dump_info() ->
    Status = sys:get_status(disksup),
    io:format("Status: ~p~n", [Status]).
395
%% Sanity check of the first entry of the filtered disk data: the id
%% must be a printable string, and both the size and the capacity must
%% be positive integers. Returns ok, or fails with a badmatch.
check_get_disk_data() ->
    [{DiskId, SizeKb, Capacity} | _] = get_disk_data(),
    true = io_lib:printable_list(DiskId),
    true = is_integer(SizeKb) andalso is_integer(Capacity),
    true = Capacity > 0 andalso SizeKb > 0,
    ok.
404
% Filter the result of disksup:get_disk_data() and remove entries with
% zero capacity.
% "Non-normal" filesystems report zero capacity.
% - Perhaps erroneous 'df -k -l'?
% - Always list filesystems by type '-t ufs,zfs,..' instead?
% It is unclear what the intention was from the beginning.
get_disk_data() ->
    get_disk_data(disksup:get_disk_data()).

% The dummy entry {"none",0,0} is kept when it is the only (or the
% last) element of the list, even though its capacity is zero.
get_disk_data(Disks) ->
    drop_zero_capacity(Disks, []).

% Accumulator-based worker for get_disk_data/1.
drop_zero_capacity([{"none",0,0} = Dummy], Kept) ->
    lists:reverse(Kept, [Dummy]);
drop_zero_capacity([{_, _, 0} | Rest], Kept) ->
    drop_zero_capacity(Rest, Kept);
drop_zero_capacity([Disk | Rest], Kept) ->
    drop_zero_capacity(Rest, [Disk | Kept]);
drop_zero_capacity([], Kept) ->
    lists:reverse(Kept).
417
%% @doc Test various expected inputs to 'df' command output (Linux/POSIX)
%% A header line, an empty string and blank lines must be rejected with
%% an error tuple; a data line must yield {ok, {KBytes, Capacity,
%% MountPath}, Rest}, also when the mount path contains spaces.
parse_df_output_posix(Config) when is_list(Config) ->
    Header = "Filesystem     1K-blocks     Used Available Use% Mounted on\n",
    lists:foreach(fun(NotADataLine) ->
                          {error, _} = disksup:parse_df(NotADataLine, posix)
                  end, [Header, "", "\n\n"]),

    %% Have a simple example with no funny spaces in mount path
    PlainLine = "tmpfs             498048     7288    490760   2% /run\n",
    {ok, {498048, 2, "/run"}, ""} = disksup:parse_df(PlainLine, posix),

    %% Have a mount path with some spaces in it
    SpacesLine = "tmpfs             498048     7288    490760   2% /spaces 1 2\n",
    {ok, {498048, 2, "/spaces 1 2"}, ""} = disksup:parse_df(SpacesLine, posix).
432
%% @doc Test various expected inputs to 'df' command output (Darwin/SUSv3)
%% A header line, an empty string and blank lines must be rejected with
%% an error tuple; a data line must yield {ok, {KBytes, Capacity,
%% MountPath}, Rest}, also when the mount path contains spaces.
parse_df_output_susv3(Config) when is_list(Config) ->
    Header = "Filesystem 1024-blocks      Used Available Capacity " ++
             "iused      ifree %iused  Mounted on",
    lists:foreach(fun(NotADataLine) ->
                          {error, _} = disksup:parse_df(NotADataLine, susv3)
                  end, [Header, "", "\n\n"]),

    %% Have a simple example with no funny spaces in mount path
    PlainLine = "/dev/disk1   243949060 157002380  86690680    65% 2029724 " ++
                "4292937555    0%   /\n",
    {ok, {243949060, 65, "/"}, ""} = disksup:parse_df(PlainLine, susv3),

    %% Have a mount path with some spaces in it
    SpacesLine = "/dev/disk1   243949060 157002380  86690680    65% 2029724 " ++
                 "4292937555    0%   /spaces 1 2\n",
    {ok, {243949060, 65, "/spaces 1 2"}, ""} = disksup:parse_df(SpacesLine, susv3).
450