1// Copyright 2018 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package poll
6
7import (
8	"sync/atomic"
9	"syscall"
10	"unsafe"
11)
12
const (
	// spliceNonblock is the flag value passed to splice(2) so that
	// the call does not block.
	spliceNonblock = 1 << 1

	// maxSpliceSize caps how many bytes Splice requests from the
	// kernel in one splice(2) call (4 MiB).
	maxSpliceSize = 4 * 1024 * 1024
)
21
22// Splice transfers at most remain bytes of data from src to dst, using the
23// splice system call to minimize copies of data from and to userspace.
24//
25// Splice creates a temporary pipe, to serve as a buffer for the data transfer.
26// src and dst must both be stream-oriented sockets.
27//
28// If err != nil, sc is the system call which caused the error.
29func Splice(dst, src *FD, remain int64) (written int64, handled bool, sc string, err error) {
30	prfd, pwfd, sc, err := newTempPipe()
31	if err != nil {
32		return 0, false, sc, err
33	}
34	defer destroyTempPipe(prfd, pwfd)
35	var inPipe, n int
36	for err == nil && remain > 0 {
37		max := maxSpliceSize
38		if int64(max) > remain {
39			max = int(remain)
40		}
41		inPipe, err = spliceDrain(pwfd, src, max)
42		// The operation is considered handled if splice returns no
43		// error, or an error other than EINVAL. An EINVAL means the
44		// kernel does not support splice for the socket type of src.
45		// The failed syscall does not consume any data so it is safe
46		// to fall back to a generic copy.
47		//
48		// spliceDrain should never return EAGAIN, so if err != nil,
49		// Splice cannot continue.
50		//
51		// If inPipe == 0 && err == nil, src is at EOF, and the
52		// transfer is complete.
53		handled = handled || (err != syscall.EINVAL)
54		if err != nil || (inPipe == 0 && err == nil) {
55			break
56		}
57		n, err = splicePump(dst, prfd, inPipe)
58		if n > 0 {
59			written += int64(n)
60			remain -= int64(n)
61		}
62	}
63	if err != nil {
64		return written, handled, "splice", err
65	}
66	return written, true, "", nil
67}
68
69// spliceDrain moves data from a socket to a pipe.
70//
71// Invariant: when entering spliceDrain, the pipe is empty. It is either in its
72// initial state, or splicePump has emptied it previously.
73//
74// Given this, spliceDrain can reasonably assume that the pipe is ready for
75// writing, so if splice returns EAGAIN, it must be because the socket is not
76// ready for reading.
77//
78// If spliceDrain returns (0, nil), src is at EOF.
79func spliceDrain(pipefd int, sock *FD, max int) (int, error) {
80	if err := sock.readLock(); err != nil {
81		return 0, err
82	}
83	defer sock.readUnlock()
84	if err := sock.pd.prepareRead(sock.isFile); err != nil {
85		return 0, err
86	}
87	for {
88		n, err := splice(pipefd, sock.Sysfd, max, spliceNonblock)
89		if err == syscall.EINTR {
90			continue
91		}
92		if err != syscall.EAGAIN {
93			return n, err
94		}
95		if err := sock.pd.waitRead(sock.isFile); err != nil {
96			return n, err
97		}
98	}
99}
100
101// splicePump moves all the buffered data from a pipe to a socket.
102//
103// Invariant: when entering splicePump, there are exactly inPipe
104// bytes of data in the pipe, from a previous call to spliceDrain.
105//
106// By analogy to the condition from spliceDrain, splicePump
107// only needs to poll the socket for readiness, if splice returns
108// EAGAIN.
109//
110// If splicePump cannot move all the data in a single call to
111// splice(2), it loops over the buffered data until it has written
112// all of it to the socket. This behavior is similar to the Write
113// step of an io.Copy in userspace.
114func splicePump(sock *FD, pipefd int, inPipe int) (int, error) {
115	if err := sock.writeLock(); err != nil {
116		return 0, err
117	}
118	defer sock.writeUnlock()
119	if err := sock.pd.prepareWrite(sock.isFile); err != nil {
120		return 0, err
121	}
122	written := 0
123	for inPipe > 0 {
124		n, err := splice(sock.Sysfd, pipefd, inPipe, spliceNonblock)
125		// Here, the condition n == 0 && err == nil should never be
126		// observed, since Splice controls the write side of the pipe.
127		if n > 0 {
128			inPipe -= n
129			written += n
130			continue
131		}
132		if err != syscall.EAGAIN {
133			return written, err
134		}
135		if err := sock.pd.waitWrite(sock.isFile); err != nil {
136			return written, err
137		}
138	}
139	return written, nil
140}
141
// splice is a thin wrapper around the splice system call that moves up to
// max bytes from the descriptor in to the descriptor out. The current
// implementation only splices between sockets and pipes, so both offset
// arguments are passed as nil. The byte count is narrowed to int because
// callers never request more in one call than fits in an int32.
func splice(out int, in int, max int, flags int) (int, error) {
	moved, err := syscall.Splice(in, nil, out, nil, max, flags)
	return int(moved), err
}
150
151var disableSplice unsafe.Pointer
152
153// newTempPipe sets up a temporary pipe for a splice operation.
154func newTempPipe() (prfd, pwfd int, sc string, err error) {
155	p := (*bool)(atomic.LoadPointer(&disableSplice))
156	if p != nil && *p {
157		return -1, -1, "splice", syscall.EINVAL
158	}
159
160	var fds [2]int
161	// pipe2 was added in 2.6.27 and our minimum requirement is 2.6.23, so it
162	// might not be implemented. Falling back to pipe is possible, but prior to
163	// 2.6.29 splice returns -EAGAIN instead of 0 when the connection is
164	// closed.
165	const flags = syscall.O_CLOEXEC | syscall.O_NONBLOCK
166	if err := syscall.Pipe2(fds[:], flags); err != nil {
167		return -1, -1, "pipe2", err
168	}
169
170	if p == nil {
171		p = new(bool)
172		defer atomic.StorePointer(&disableSplice, unsafe.Pointer(p))
173
174		// F_GETPIPE_SZ was added in 2.6.35, which does not have the -EAGAIN bug.
175		if syscall.F_GETPIPE_SZ == 0 {
176			*p = true
177			destroyTempPipe(fds[0], fds[1])
178			return -1, -1, "fcntl", syscall.EINVAL
179		}
180		if _, err := fcntl(fds[0], syscall.F_GETPIPE_SZ, 0); err != nil {
181			*p = true
182			destroyTempPipe(fds[0], fds[1])
183			return -1, -1, "fcntl", err
184		}
185	}
186
187	return fds[0], fds[1], "", nil
188}
189
190// destroyTempPipe destroys a temporary pipe.
191func destroyTempPipe(prfd, pwfd int) error {
192	err := CloseFunc(prfd)
193	err1 := CloseFunc(pwfd)
194	if err == nil {
195		return err1
196	}
197	return err
198}
199