1// Copyright 2018 The Prometheus Authors 2// Licensed under the Apache License, Version 2.0 (the "License"); 3// you may not use this file except in compliance with the License. 4// You may obtain a copy of the License at 5// 6// http://www.apache.org/licenses/LICENSE-2.0 7// 8// Unless required by applicable law or agreed to in writing, software 9// distributed under the License is distributed on an "AS IS" BASIS, 10// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11// See the License for the specific language governing permissions and 12// limitations under the License. 13 14package blockdevice 15 16import ( 17 "bufio" 18 "fmt" 19 "github.com/prometheus/procfs/internal/util" 20 "io" 21 "io/ioutil" 22 "os" 23 "strings" 24 25 "github.com/prometheus/procfs/internal/fs" 26) 27 28// Info contains identifying information for a block device such as a disk drive 29type Info struct { 30 MajorNumber uint32 31 MinorNumber uint32 32 DeviceName string 33} 34 35// IOStats models the iostats data described in the kernel documentation 36// https://www.kernel.org/doc/Documentation/iostats.txt, 37// https://www.kernel.org/doc/Documentation/block/stat.txt, 38// and https://www.kernel.org/doc/Documentation/ABI/testing/procfs-diskstats 39type IOStats struct { 40 // ReadIOs is the number of reads completed successfully. 41 ReadIOs uint64 42 // ReadMerges is the number of reads merged. Reads and writes 43 // which are adjacent to each other may be merged for efficiency. 44 ReadMerges uint64 45 // ReadSectors is the total number of sectors read successfully. 46 ReadSectors uint64 47 // ReadTicks is the total number of milliseconds spent by all reads. 48 ReadTicks uint64 49 // WriteIOs is the total number of writes completed successfully. 50 WriteIOs uint64 51 // WriteMerges is the number of reads merged. 52 WriteMerges uint64 53 // WriteSectors is the total number of sectors written successfully. 54 WriteSectors uint64 55 // WriteTicks is the total number of milliseconds spent by all writes. 56 WriteTicks uint64 57 // IOsInProgress is number of I/Os currently in progress. 58 IOsInProgress uint64 59 // IOsTotalTicks is the number of milliseconds spent doing I/Os. 60 // This field increases so long as IosInProgress is nonzero. 61 IOsTotalTicks uint64 62 // WeightedIOTicks is the weighted number of milliseconds spent doing I/Os. 63 // This can also be used to estimate average queue wait time for requests. 64 WeightedIOTicks uint64 65 // DiscardIOs is the total number of discards completed successfully. 66 DiscardIOs uint64 67 // DiscardMerges is the number of discards merged. 68 DiscardMerges uint64 69 // DiscardSectors is the total number of sectors discarded successfully. 70 DiscardSectors uint64 71 // DiscardTicks is the total number of milliseconds spent by all discards. 72 DiscardTicks uint64 73 // FlushRequestsCompleted is the total number of flush request completed successfully. 74 FlushRequestsCompleted uint64 75 // TimeSpentFlushing is the total number of milliseconds spent flushing. 76 TimeSpentFlushing uint64 77} 78 79// Diskstats combines the device Info and IOStats 80type Diskstats struct { 81 Info 82 IOStats 83 // IoStatsCount contains the number of io stats read. For kernel versions 5.5+, 84 // there should be 20 fields read. For kernel versions 4.18+, 85 // there should be 18 fields read. For earlier kernel versions this 86 // will be 14 because the discard values are not available. 87 IoStatsCount int 88} 89 90// BlockQueueStats models the queue files that are located in the sysfs tree for each block device 91// and described in the kernel documentation: 92// https://www.kernel.org/doc/Documentation/block/queue-sysfs.txt 93// https://www.kernel.org/doc/html/latest/block/queue-sysfs.html 94type BlockQueueStats struct { 95 // AddRandom is the status of a disk entropy (1 is on, 0 is off). 96 AddRandom uint64 97 // Dax indicates whether the device supports Direct Access (DAX) (1 is on, 0 is off). 98 DAX uint64 99 // DiscardGranularity is the size of internal allocation of the device in bytes, 0 means device 100 // does not support the discard functionality. 101 DiscardGranularity uint64 102 // DiscardMaxHWBytes is the hardware maximum number of bytes that can be discarded in a single operation, 103 // 0 means device does not support the discard functionality. 104 DiscardMaxHWBytes uint64 105 // DiscardMaxBytes is the software maximum number of bytes that can be discarded in a single operation. 106 DiscardMaxBytes uint64 107 // HWSectorSize is the sector size of the device, in bytes. 108 HWSectorSize uint64 109 // IOPoll indicates if polling is enabled (1 is on, 0 is off). 110 IOPoll uint64 111 // IOPollDelay indicates how polling will be performed, -1 for classic polling, 0 for hybrid polling, 112 // with greater than 0 the kernel will put process issuing IO to sleep for this amount of time in 113 // microseconds before entering classic polling. 114 IOPollDelay int64 115 // IOTimeout is the request timeout in milliseconds. 116 IOTimeout uint64 117 // IOStats indicates if iostats accounting is used for the disk (1 is on, 0 is off). 118 IOStats uint64 119 // LogicalBlockSize is the logical block size of the device, in bytes. 120 LogicalBlockSize uint64 121 // MaxHWSectorsKB is the maximum number of kilobytes supported in a single data transfer. 122 MaxHWSectorsKB uint64 123 // MaxIntegritySegments is the max limit of integrity segments as set by block layer which a hardware controller 124 // can handle. 125 MaxIntegritySegments uint64 126 // MaxSectorsKB is the maximum number of kilobytes that the block layer will allow for a filesystem request. 127 MaxSectorsKB uint64 128 // MaxSegments is the number of segments on the device. 129 MaxSegments uint64 130 // MaxSegmentsSize is the maximum segment size of the device. 131 MaxSegmentSize uint64 132 // MinimumIOSize is the smallest preferred IO size reported by the device. 133 MinimumIOSize uint64 134 // NoMerges shows the lookup logic involved with IO merging requests in the block layer. 0 all merges are 135 // enabled, 1 only simple one hit merges are tried, 2 no merge algorithms will be tried. 136 NoMerges uint64 137 // NRRequests is the number of how many requests may be allocated in the block layer for read or write requests. 138 NRRequests uint64 139 // OptimalIOSize is the optimal IO size reported by the device. 140 OptimalIOSize uint64 141 // PhysicalBlockSize is the physical block size of device, in bytes. 142 PhysicalBlockSize uint64 143 // ReadAHeadKB is the maximum number of kilobytes to read-ahead for filesystems on this block device. 144 ReadAHeadKB uint64 145 // Rotational indicates if the device is of rotational type or non-rotational type. 146 Rotational uint64 147 // RQAffinity indicates affinity policy of device, if 1 the block layer will migrate request completions to the 148 // cpu “group” that originally submitted the request, if 2 forces the completion to run on the requesting cpu. 149 RQAffinity uint64 150 // SchedulerList contains list of available schedulers for this block device. 151 SchedulerList []string 152 // SchedulerCurrent is the current scheduler for this block device. 153 SchedulerCurrent string 154 // WriteCache shows the type of cache for block device, "write back" or "write through". 155 WriteCache string 156 // WriteSameMaxBytes is the number of bytes the device can write in a single write-same command. 157 // A value of ‘0’ means write-same is not supported by this device. 158 WriteSameMaxBytes uint64 159 // WBTLatUSec is the target minimum read latency, 0 means feature is disables. 160 WBTLatUSec int64 161 // ThrottleSampleTime is the time window that blk-throttle samples data, in millisecond. Optional 162 // exists only if CONFIG_BLK_DEV_THROTTLING_LOW is enabled. 163 ThrottleSampleTime *uint64 164 // Zoned indicates if the device is a zoned block device and the zone model of the device if it is indeed zoned. 165 // Possible values are: none, host-aware, host-managed for zoned block devices. 166 Zoned string 167 // NRZones indicates the total number of zones of the device, always zero for regular block devices. 168 NRZones uint64 169 // ChunksSectors for RAID is the size in 512B sectors of the RAID volume stripe segment, 170 // for zoned host device is the size in 512B sectors. 171 ChunkSectors uint64 172 // FUA indicates whether the device supports Force Unit Access for write requests. 173 FUA uint64 174 // MaxDiscardSegments is the maximum number of DMA entries in a discard request. 175 MaxDiscardSegments uint64 176 // WriteZeroesMaxBytes the maximum number of bytes that can be zeroed at once. 177 // The value 0 means that REQ_OP_WRITE_ZEROES is not supported. 178 WriteZeroesMaxBytes uint64 179} 180 181const ( 182 procDiskstatsPath = "diskstats" 183 procDiskstatsFormat = "%d %d %s %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d" 184 sysBlockPath = "block" 185 sysBlockStatFormat = "%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d" 186 sysBlockQueue = "queue" 187) 188 189// FS represents the pseudo-filesystems proc and sys, which provides an 190// interface to kernel data structures. 191type FS struct { 192 proc *fs.FS 193 sys *fs.FS 194} 195 196// NewDefaultFS returns a new blockdevice fs using the default mountPoints for proc and sys. 197// It will error if either of these mount points can't be read. 198func NewDefaultFS() (FS, error) { 199 return NewFS(fs.DefaultProcMountPoint, fs.DefaultSysMountPoint) 200} 201 202// NewFS returns a new blockdevice fs using the given mountPoints for proc and sys. 203// It will error if either of these mount points can't be read. 204func NewFS(procMountPoint string, sysMountPoint string) (FS, error) { 205 if strings.TrimSpace(procMountPoint) == "" { 206 procMountPoint = fs.DefaultProcMountPoint 207 } 208 procfs, err := fs.NewFS(procMountPoint) 209 if err != nil { 210 return FS{}, err 211 } 212 if strings.TrimSpace(sysMountPoint) == "" { 213 sysMountPoint = fs.DefaultSysMountPoint 214 } 215 sysfs, err := fs.NewFS(sysMountPoint) 216 if err != nil { 217 return FS{}, err 218 } 219 return FS{&procfs, &sysfs}, nil 220} 221 222// ProcDiskstats reads the diskstats file and returns 223// an array of Diskstats (one per line/device) 224func (fs FS) ProcDiskstats() ([]Diskstats, error) { 225 file, err := os.Open(fs.proc.Path(procDiskstatsPath)) 226 if err != nil { 227 return nil, err 228 } 229 defer file.Close() 230 231 diskstats := []Diskstats{} 232 scanner := bufio.NewScanner(file) 233 for scanner.Scan() { 234 d := &Diskstats{} 235 d.IoStatsCount, err = fmt.Sscanf(scanner.Text(), procDiskstatsFormat, 236 &d.MajorNumber, 237 &d.MinorNumber, 238 &d.DeviceName, 239 &d.ReadIOs, 240 &d.ReadMerges, 241 &d.ReadSectors, 242 &d.ReadTicks, 243 &d.WriteIOs, 244 &d.WriteMerges, 245 &d.WriteSectors, 246 &d.WriteTicks, 247 &d.IOsInProgress, 248 &d.IOsTotalTicks, 249 &d.WeightedIOTicks, 250 &d.DiscardIOs, 251 &d.DiscardMerges, 252 &d.DiscardSectors, 253 &d.DiscardTicks, 254 &d.FlushRequestsCompleted, 255 &d.TimeSpentFlushing, 256 ) 257 // The io.EOF error can be safely ignored because it just means we read fewer than 258 // the full 20 fields. 259 if err != nil && err != io.EOF { 260 return diskstats, err 261 } 262 if d.IoStatsCount >= 14 { 263 diskstats = append(diskstats, *d) 264 } 265 } 266 return diskstats, scanner.Err() 267} 268 269// SysBlockDevices lists the device names from /sys/block/<dev> 270func (fs FS) SysBlockDevices() ([]string, error) { 271 deviceDirs, err := ioutil.ReadDir(fs.sys.Path(sysBlockPath)) 272 if err != nil { 273 return nil, err 274 } 275 devices := []string{} 276 for _, deviceDir := range deviceDirs { 277 if deviceDir.IsDir() { 278 devices = append(devices, deviceDir.Name()) 279 } 280 } 281 return devices, nil 282} 283 284// SysBlockDeviceStat returns stats for the block device read from /sys/block/<device>/stat. 285// The number of stats read will be 15 if the discard stats are available (kernel 4.18+) 286// and 11 if they are not available. 287func (fs FS) SysBlockDeviceStat(device string) (IOStats, int, error) { 288 stat := IOStats{} 289 bytes, err := ioutil.ReadFile(fs.sys.Path(sysBlockPath, device, "stat")) 290 if err != nil { 291 return stat, 0, err 292 } 293 count, err := fmt.Sscanf(strings.TrimSpace(string(bytes)), sysBlockStatFormat, 294 &stat.ReadIOs, 295 &stat.ReadMerges, 296 &stat.ReadSectors, 297 &stat.ReadTicks, 298 &stat.WriteIOs, 299 &stat.WriteMerges, 300 &stat.WriteSectors, 301 &stat.WriteTicks, 302 &stat.IOsInProgress, 303 &stat.IOsTotalTicks, 304 &stat.WeightedIOTicks, 305 &stat.DiscardIOs, 306 &stat.DiscardMerges, 307 &stat.DiscardSectors, 308 &stat.DiscardTicks, 309 ) 310 // An io.EOF error is ignored because it just means we read fewer than the full 15 fields. 311 if err == io.EOF { 312 return stat, count, nil 313 } 314 return stat, count, err 315} 316 317// SysBlockDeviceQueueStats returns stats for /sys/block/xxx/queue where xxx is a device name. 318func (fs FS) SysBlockDeviceQueueStats(device string) (BlockQueueStats, error) { 319 stat := BlockQueueStats{} 320 // files with uint64 fields 321 for file, p := range map[string]*uint64{ 322 "add_random": &stat.AddRandom, 323 "dax": &stat.DAX, 324 "discard_granularity": &stat.DiscardGranularity, 325 "discard_max_hw_bytes": &stat.DiscardMaxHWBytes, 326 "discard_max_bytes": &stat.DiscardMaxBytes, 327 "hw_sector_size": &stat.HWSectorSize, 328 "io_poll": &stat.IOPoll, 329 "io_timeout": &stat.IOTimeout, 330 "iostats": &stat.IOStats, 331 "logical_block_size": &stat.LogicalBlockSize, 332 "max_hw_sectors_kb": &stat.MaxHWSectorsKB, 333 "max_integrity_segments": &stat.MaxIntegritySegments, 334 "max_sectors_kb": &stat.MaxSectorsKB, 335 "max_segments": &stat.MaxSegments, 336 "max_segment_size": &stat.MaxSegmentSize, 337 "minimum_io_size": &stat.MinimumIOSize, 338 "nomerges": &stat.NoMerges, 339 "nr_requests": &stat.NRRequests, 340 "optimal_io_size": &stat.OptimalIOSize, 341 "physical_block_size": &stat.PhysicalBlockSize, 342 "read_ahead_kb": &stat.ReadAHeadKB, 343 "rotational": &stat.Rotational, 344 "rq_affinity": &stat.RQAffinity, 345 "write_same_max_bytes": &stat.WriteSameMaxBytes, 346 "nr_zones": &stat.NRZones, 347 "chunk_sectors": &stat.ChunkSectors, 348 "fua": &stat.FUA, 349 "max_discard_segments": &stat.MaxDiscardSegments, 350 "write_zeroes_max_bytes": &stat.WriteZeroesMaxBytes, 351 } { 352 val, err := util.ReadUintFromFile(fs.sys.Path(sysBlockPath, device, sysBlockQueue, file)) 353 if err != nil { 354 return BlockQueueStats{}, err 355 } 356 *p = val 357 } 358 // files with int64 fields 359 for file, p := range map[string]*int64{ 360 "io_poll_delay": &stat.IOPollDelay, 361 "wbt_lat_usec": &stat.WBTLatUSec, 362 } { 363 val, err := util.ReadIntFromFile(fs.sys.Path(sysBlockPath, device, sysBlockQueue, file)) 364 if err != nil { 365 return BlockQueueStats{}, err 366 } 367 *p = val 368 } 369 // files with string fields 370 for file, p := range map[string]*string{ 371 "write_cache": &stat.WriteCache, 372 "zoned": &stat.Zoned, 373 } { 374 val, err := util.SysReadFile(fs.sys.Path(sysBlockPath, device, sysBlockQueue, file)) 375 if err != nil { 376 return BlockQueueStats{}, err 377 } 378 *p = val 379 } 380 scheduler, err := util.SysReadFile(fs.sys.Path(sysBlockPath, device, sysBlockQueue, "scheduler")) 381 if err != nil { 382 return BlockQueueStats{}, err 383 } 384 var schedulers []string 385 xs := strings.Split(scheduler, " ") 386 for _, s := range xs { 387 if strings.HasPrefix(s, "[") && strings.HasSuffix(s, "]") { 388 s = s[1 : len(s)-1] 389 stat.SchedulerCurrent = s 390 } 391 schedulers = append(schedulers, s) 392 } 393 stat.SchedulerList = schedulers 394 // optional 395 throttleSampleTime, err := util.ReadUintFromFile(fs.sys.Path(sysBlockPath, device, sysBlockQueue, "throttle_sample_time")) 396 if err == nil { 397 stat.ThrottleSampleTime = &throttleSampleTime 398 } 399 return stat, nil 400} 401