1// Copyright 2018 The Prometheus Authors
2// Licensed under the Apache License, Version 2.0 (the "License");
3// you may not use this file except in compliance with the License.
4// You may obtain a copy of the License at
5//
6// http://www.apache.org/licenses/LICENSE-2.0
7//
8// Unless required by applicable law or agreed to in writing, software
9// distributed under the License is distributed on an "AS IS" BASIS,
10// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11// See the License for the specific language governing permissions and
12// limitations under the License.
13
14package blockdevice
15
16import (
17	"bufio"
18	"fmt"
19	"github.com/prometheus/procfs/internal/util"
20	"io"
21	"io/ioutil"
22	"os"
23	"strings"
24
25	"github.com/prometheus/procfs/internal/fs"
26)
27
// Info contains identifying information for a block device such as a disk drive:
// the device's major number, its minor number, and its device name.
type Info struct {
	MajorNumber uint32
	MinorNumber uint32
	DeviceName  string
}
34
// IOStats models the iostats data described in the kernel documentation
// https://www.kernel.org/doc/Documentation/iostats.txt,
// https://www.kernel.org/doc/Documentation/block/stat.txt,
// and https://www.kernel.org/doc/Documentation/ABI/testing/procfs-diskstats
//
// Fields that are not present in the parsed input are left at zero.
type IOStats struct {
	// ReadIOs is the number of reads completed successfully.
	ReadIOs uint64
	// ReadMerges is the number of reads merged. Reads and writes
	// which are adjacent to each other may be merged for efficiency.
	ReadMerges uint64
	// ReadSectors is the total number of sectors read successfully.
	ReadSectors uint64
	// ReadTicks is the total number of milliseconds spent by all reads.
	ReadTicks uint64
	// WriteIOs is the total number of writes completed successfully.
	WriteIOs uint64
	// WriteMerges is the number of writes merged.
	WriteMerges uint64
	// WriteSectors is the total number of sectors written successfully.
	WriteSectors uint64
	// WriteTicks is the total number of milliseconds spent by all writes.
	WriteTicks uint64
	// IOsInProgress is the number of I/Os currently in progress.
	IOsInProgress uint64
	// IOsTotalTicks is the number of milliseconds spent doing I/Os.
	// This field increases so long as IOsInProgress is nonzero.
	IOsTotalTicks uint64
	// WeightedIOTicks is the weighted number of milliseconds spent doing I/Os.
	// This can also be used to estimate average queue wait time for requests.
	WeightedIOTicks uint64
	// DiscardIOs is the total number of discards completed successfully.
	DiscardIOs uint64
	// DiscardMerges is the number of discards merged.
	DiscardMerges uint64
	// DiscardSectors is the total number of sectors discarded successfully.
	DiscardSectors uint64
	// DiscardTicks is the total number of milliseconds spent by all discards.
	DiscardTicks uint64
	// FlushRequestsCompleted is the total number of flush requests completed successfully.
	FlushRequestsCompleted uint64
	// TimeSpentFlushing is the total number of milliseconds spent flushing.
	TimeSpentFlushing uint64
}
78
// Diskstats combines the device Info and IOStats parsed from one line of
// the diskstats file.
type Diskstats struct {
	Info
	IOStats
	// IoStatsCount contains the number of fields read from the line. It is the
	// item count returned by fmt.Sscanf in ProcDiskstats, so it includes the
	// three Info fields as well as the IOStats counters. For kernel versions
	// 5.5+, there should be 20 fields read. For kernel versions 4.18+, there
	// should be 18 fields read. For earlier kernel versions this will be 14
	// because the discard values are not available.
	IoStatsCount int
}
89
// BlockQueueStats models the queue files that are located in the sysfs tree for each block device
// and described in the kernel documentation:
// https://www.kernel.org/doc/Documentation/block/queue-sysfs.txt
// https://www.kernel.org/doc/html/latest/block/queue-sysfs.html
type BlockQueueStats struct {
	// AddRandom is the status of disk entropy contribution (1 is on, 0 is off).
	AddRandom uint64
	// DAX indicates whether the device supports Direct Access (DAX) (1 is on, 0 is off).
	DAX uint64
	// DiscardGranularity is the size of internal allocation of the device in bytes, 0 means device
	// does not support the discard functionality.
	DiscardGranularity uint64
	// DiscardMaxHWBytes is the hardware maximum number of bytes that can be discarded in a single operation,
	// 0 means device does not support the discard functionality.
	DiscardMaxHWBytes uint64
	// DiscardMaxBytes is the software maximum number of bytes that can be discarded in a single operation.
	DiscardMaxBytes uint64
	// HWSectorSize is the sector size of the device, in bytes.
	HWSectorSize uint64
	// IOPoll indicates if polling is enabled (1 is on, 0 is off).
	IOPoll uint64
	// IOPollDelay indicates how polling will be performed: -1 for classic polling, 0 for hybrid polling;
	// with a value greater than 0 the kernel will put the process issuing IO to sleep for this amount of
	// time in microseconds before entering classic polling.
	IOPollDelay int64
	// IOTimeout is the request timeout in milliseconds.
	IOTimeout uint64
	// IOStats indicates if iostats accounting is used for the disk (1 is on, 0 is off).
	IOStats uint64
	// LogicalBlockSize is the logical block size of the device, in bytes.
	LogicalBlockSize uint64
	// MaxHWSectorsKB is the maximum number of kilobytes supported in a single data transfer.
	MaxHWSectorsKB uint64
	// MaxIntegritySegments is the max limit of integrity segments as set by block layer which a hardware controller
	// can handle.
	MaxIntegritySegments uint64
	// MaxSectorsKB is the maximum number of kilobytes that the block layer will allow for a filesystem request.
	MaxSectorsKB uint64
	// MaxSegments is the maximum number of segments of the device.
	MaxSegments uint64
	// MaxSegmentSize is the maximum segment size of the device.
	MaxSegmentSize uint64
	// MinimumIOSize is the smallest preferred IO size reported by the device.
	MinimumIOSize uint64
	// NoMerges shows the lookup logic involved with IO merging requests in the block layer: 0 all merges are
	// enabled, 1 only simple one hit merges are tried, 2 no merge algorithms will be tried.
	NoMerges uint64
	// NRRequests is the number of requests that may be allocated in the block layer for read or write requests.
	NRRequests uint64
	// OptimalIOSize is the optimal IO size reported by the device.
	OptimalIOSize uint64
	// PhysicalBlockSize is the physical block size of the device, in bytes.
	PhysicalBlockSize uint64
	// ReadAHeadKB is the maximum number of kilobytes to read-ahead for filesystems on this block device.
	ReadAHeadKB uint64
	// Rotational indicates if the device is of rotational type or non-rotational type.
	Rotational uint64
	// RQAffinity indicates the affinity policy of the device: if 1 the block layer will migrate request
	// completions to the cpu "group" that originally submitted the request, if 2 it forces the completion
	// to run on the requesting cpu.
	RQAffinity uint64
	// SchedulerList contains the list of available schedulers for this block device.
	SchedulerList []string
	// SchedulerCurrent is the current scheduler for this block device.
	SchedulerCurrent string
	// WriteCache shows the type of cache for the block device, "write back" or "write through".
	WriteCache string
	// WriteSameMaxBytes is the number of bytes the device can write in a single write-same command.
	// A value of '0' means write-same is not supported by this device.
	WriteSameMaxBytes uint64
	// WBTLatUSec is the target minimum read latency, 0 means the feature is disabled.
	WBTLatUSec int64
	// ThrottleSampleTime is the time window that blk-throttle samples data, in milliseconds. Optional:
	// the file exists only if CONFIG_BLK_DEV_THROTTLING_LOW is enabled, and the field is nil when the
	// value could not be read.
	ThrottleSampleTime *uint64
	// Zoned indicates if the device is a zoned block device and the zone model of the device if it is indeed zoned.
	// Possible values are: none, host-aware, host-managed for zoned block devices.
	Zoned string
	// NRZones indicates the total number of zones of the device, always zero for regular block devices.
	NRZones uint64
	// ChunkSectors for RAID is the size in 512B sectors of the RAID volume stripe segment,
	// for zoned host device is the size in 512B sectors.
	ChunkSectors uint64
	// FUA indicates whether the device supports Force Unit Access for write requests.
	FUA uint64
	// MaxDiscardSegments is the maximum number of DMA entries in a discard request.
	MaxDiscardSegments uint64
	// WriteZeroesMaxBytes is the maximum number of bytes that can be zeroed at once.
	// The value 0 means that REQ_OP_WRITE_ZEROES is not supported.
	WriteZeroesMaxBytes uint64
}
180
// Paths and scan formats used when reading block device statistics.
//
// procDiskstatsPath is the diskstats file name relative to the proc mount
// point, and procDiskstatsFormat is the fmt.Sscanf format for one of its
// lines: major number, minor number, device name, then the numeric counters.
//
// sysBlockPath is the block directory relative to the sys mount point,
// sysBlockStatFormat is the fmt.Sscanf format for a device's "stat" file, and
// sysBlockQueue is the name of the per-device queue subdirectory.
const (
	procDiskstatsPath   = "diskstats"
	procDiskstatsFormat = "%d %d %s %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d"
	sysBlockPath        = "block"
	sysBlockStatFormat  = "%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d"
	sysBlockQueue       = "queue"
)
188
// FS represents the pseudo-filesystems proc and sys, which provides an
// interface to kernel data structures.
type FS struct {
	proc *fs.FS // root of the proc mount point, used to locate the diskstats file
	sys  *fs.FS // root of the sys mount point, used to locate the block directory
}
195
196// NewDefaultFS returns a new blockdevice fs using the default mountPoints for proc and sys.
197// It will error if either of these mount points can't be read.
198func NewDefaultFS() (FS, error) {
199	return NewFS(fs.DefaultProcMountPoint, fs.DefaultSysMountPoint)
200}
201
202// NewFS returns a new blockdevice fs using the given mountPoints for proc and sys.
203// It will error if either of these mount points can't be read.
204func NewFS(procMountPoint string, sysMountPoint string) (FS, error) {
205	if strings.TrimSpace(procMountPoint) == "" {
206		procMountPoint = fs.DefaultProcMountPoint
207	}
208	procfs, err := fs.NewFS(procMountPoint)
209	if err != nil {
210		return FS{}, err
211	}
212	if strings.TrimSpace(sysMountPoint) == "" {
213		sysMountPoint = fs.DefaultSysMountPoint
214	}
215	sysfs, err := fs.NewFS(sysMountPoint)
216	if err != nil {
217		return FS{}, err
218	}
219	return FS{&procfs, &sysfs}, nil
220}
221
222// ProcDiskstats reads the diskstats file and returns
223// an array of Diskstats (one per line/device)
224func (fs FS) ProcDiskstats() ([]Diskstats, error) {
225	file, err := os.Open(fs.proc.Path(procDiskstatsPath))
226	if err != nil {
227		return nil, err
228	}
229	defer file.Close()
230
231	diskstats := []Diskstats{}
232	scanner := bufio.NewScanner(file)
233	for scanner.Scan() {
234		d := &Diskstats{}
235		d.IoStatsCount, err = fmt.Sscanf(scanner.Text(), procDiskstatsFormat,
236			&d.MajorNumber,
237			&d.MinorNumber,
238			&d.DeviceName,
239			&d.ReadIOs,
240			&d.ReadMerges,
241			&d.ReadSectors,
242			&d.ReadTicks,
243			&d.WriteIOs,
244			&d.WriteMerges,
245			&d.WriteSectors,
246			&d.WriteTicks,
247			&d.IOsInProgress,
248			&d.IOsTotalTicks,
249			&d.WeightedIOTicks,
250			&d.DiscardIOs,
251			&d.DiscardMerges,
252			&d.DiscardSectors,
253			&d.DiscardTicks,
254			&d.FlushRequestsCompleted,
255			&d.TimeSpentFlushing,
256		)
257		// The io.EOF error can be safely ignored because it just means we read fewer than
258		// the full 20 fields.
259		if err != nil && err != io.EOF {
260			return diskstats, err
261		}
262		if d.IoStatsCount >= 14 {
263			diskstats = append(diskstats, *d)
264		}
265	}
266	return diskstats, scanner.Err()
267}
268
269// SysBlockDevices lists the device names from /sys/block/<dev>
270func (fs FS) SysBlockDevices() ([]string, error) {
271	deviceDirs, err := ioutil.ReadDir(fs.sys.Path(sysBlockPath))
272	if err != nil {
273		return nil, err
274	}
275	devices := []string{}
276	for _, deviceDir := range deviceDirs {
277		if deviceDir.IsDir() {
278			devices = append(devices, deviceDir.Name())
279		}
280	}
281	return devices, nil
282}
283
284// SysBlockDeviceStat returns stats for the block device read from /sys/block/<device>/stat.
285// The number of stats read will be 15 if the discard stats are available (kernel 4.18+)
286// and 11 if they are not available.
287func (fs FS) SysBlockDeviceStat(device string) (IOStats, int, error) {
288	stat := IOStats{}
289	bytes, err := ioutil.ReadFile(fs.sys.Path(sysBlockPath, device, "stat"))
290	if err != nil {
291		return stat, 0, err
292	}
293	count, err := fmt.Sscanf(strings.TrimSpace(string(bytes)), sysBlockStatFormat,
294		&stat.ReadIOs,
295		&stat.ReadMerges,
296		&stat.ReadSectors,
297		&stat.ReadTicks,
298		&stat.WriteIOs,
299		&stat.WriteMerges,
300		&stat.WriteSectors,
301		&stat.WriteTicks,
302		&stat.IOsInProgress,
303		&stat.IOsTotalTicks,
304		&stat.WeightedIOTicks,
305		&stat.DiscardIOs,
306		&stat.DiscardMerges,
307		&stat.DiscardSectors,
308		&stat.DiscardTicks,
309	)
310	// An io.EOF error is ignored because it just means we read fewer than the full 15 fields.
311	if err == io.EOF {
312		return stat, count, nil
313	}
314	return stat, count, err
315}
316
317// SysBlockDeviceQueueStats returns stats for /sys/block/xxx/queue where xxx is a device name.
318func (fs FS) SysBlockDeviceQueueStats(device string) (BlockQueueStats, error) {
319	stat := BlockQueueStats{}
320	// files with uint64 fields
321	for file, p := range map[string]*uint64{
322		"add_random":             &stat.AddRandom,
323		"dax":                    &stat.DAX,
324		"discard_granularity":    &stat.DiscardGranularity,
325		"discard_max_hw_bytes":   &stat.DiscardMaxHWBytes,
326		"discard_max_bytes":      &stat.DiscardMaxBytes,
327		"hw_sector_size":         &stat.HWSectorSize,
328		"io_poll":                &stat.IOPoll,
329		"io_timeout":             &stat.IOTimeout,
330		"iostats":                &stat.IOStats,
331		"logical_block_size":     &stat.LogicalBlockSize,
332		"max_hw_sectors_kb":      &stat.MaxHWSectorsKB,
333		"max_integrity_segments": &stat.MaxIntegritySegments,
334		"max_sectors_kb":         &stat.MaxSectorsKB,
335		"max_segments":           &stat.MaxSegments,
336		"max_segment_size":       &stat.MaxSegmentSize,
337		"minimum_io_size":        &stat.MinimumIOSize,
338		"nomerges":               &stat.NoMerges,
339		"nr_requests":            &stat.NRRequests,
340		"optimal_io_size":        &stat.OptimalIOSize,
341		"physical_block_size":    &stat.PhysicalBlockSize,
342		"read_ahead_kb":          &stat.ReadAHeadKB,
343		"rotational":             &stat.Rotational,
344		"rq_affinity":            &stat.RQAffinity,
345		"write_same_max_bytes":   &stat.WriteSameMaxBytes,
346		"nr_zones":               &stat.NRZones,
347		"chunk_sectors":          &stat.ChunkSectors,
348		"fua":                    &stat.FUA,
349		"max_discard_segments":   &stat.MaxDiscardSegments,
350		"write_zeroes_max_bytes": &stat.WriteZeroesMaxBytes,
351	} {
352		val, err := util.ReadUintFromFile(fs.sys.Path(sysBlockPath, device, sysBlockQueue, file))
353		if err != nil {
354			return BlockQueueStats{}, err
355		}
356		*p = val
357	}
358	// files with int64 fields
359	for file, p := range map[string]*int64{
360		"io_poll_delay": &stat.IOPollDelay,
361		"wbt_lat_usec":  &stat.WBTLatUSec,
362	} {
363		val, err := util.ReadIntFromFile(fs.sys.Path(sysBlockPath, device, sysBlockQueue, file))
364		if err != nil {
365			return BlockQueueStats{}, err
366		}
367		*p = val
368	}
369	// files with string fields
370	for file, p := range map[string]*string{
371		"write_cache": &stat.WriteCache,
372		"zoned":       &stat.Zoned,
373	} {
374		val, err := util.SysReadFile(fs.sys.Path(sysBlockPath, device, sysBlockQueue, file))
375		if err != nil {
376			return BlockQueueStats{}, err
377		}
378		*p = val
379	}
380	scheduler, err := util.SysReadFile(fs.sys.Path(sysBlockPath, device, sysBlockQueue, "scheduler"))
381	if err != nil {
382		return BlockQueueStats{}, err
383	}
384	var schedulers []string
385	xs := strings.Split(scheduler, " ")
386	for _, s := range xs {
387		if strings.HasPrefix(s, "[") && strings.HasSuffix(s, "]") {
388			s = s[1 : len(s)-1]
389			stat.SchedulerCurrent = s
390		}
391		schedulers = append(schedulers, s)
392	}
393	stat.SchedulerList = schedulers
394	// optional
395	throttleSampleTime, err := util.ReadUintFromFile(fs.sys.Path(sysBlockPath, device, sysBlockQueue, "throttle_sample_time"))
396	if err == nil {
397		stat.ThrottleSampleTime = &throttleSampleTime
398	}
399	return stat, nil
400}
401