lightstep/lightstep-tracer-go/tracer.go

// Package lightstep implements the LightStep OpenTracing client for Go.
package lightstep

import (
	"context"
	"fmt"
	"runtime"
	"sync"
	"time"

	"github.com/opentracing/opentracing-go"
)

// Tracer extends the `opentracing.Tracer` interface with methods for manual
// flushing and closing. To access these methods, you can take the global
// tracer and typecast it to a `lightstep.Tracer`. As a convenience, the
// lightstep package provides static functions which perform the typecasting.
type Tracer interface {
	opentracing.Tracer

	// Close flushes and then terminates the LightStep collector connection.
	// Only the first call has an effect; subsequent calls are no-ops.
	Close(context.Context)
	// Flush sends all spans currently in the buffer to the LightStep collector.
	Flush(context.Context)
	// Options returns the Options the tracer was created with (see NewTracer
	// and CreateTracer).
	Options() Options
	// Disable prevents the tracer from recording spans or flushing.
	Disable()
}

// tracerImpl implements the `Tracer` interface. It buffers finished spans and
// forwards them to a LightStep collector.
type tracerImpl struct {
	//////////////////////////////////////////////////////////////
	// IMMUTABLE IMMUTABLE IMMUTABLE IMMUTABLE IMMUTABLE IMMUTABLE
	//////////////////////////////////////////////////////////////

	// Note: there may be a desire to update some of these fields
	// at runtime, in which case suitable changes may be needed
	// for variables accessed during Flush.

	reporterID uint64 // the LightStep tracer guid
	opts       Options

	// Report loop management. Close runs its shutdown sequence through
	// closeOnce: it closes closeReportLoopChannel to ask reportLoop to stop
	// and then waits on reportLoopClosedChannel, which reportLoop closes when
	// it exits in response to that request.
	closeOnce               sync.Once
	closeReportLoopChannel  chan struct{}
	reportLoopClosedChannel chan struct{}

	// Inputs Flush uses to build each report request.
	converter   *protoConverter
	accessToken string
	attributes  map[string]string

	//////////////////////////////////////////////////////////
	// MUTABLE MUTABLE MUTABLE MUTABLE MUTABLE MUTABLE MUTABLE
	//////////////////////////////////////////////////////////

	// The following fields are modified under `lock`, except where a field's
	// own comment says otherwise.
	lock sync.Mutex

	// Remote service that will receive reports. client is created once in
	// CreateTracer; connection is replaced by reconnectClient and set to nil
	// by Close.
	client     collectorClient
	connection Connection

	// Two buffers of data. RecordSpan appends to buffer; preFlush swaps the
	// two so that Flush reports the contents of flushing while new spans keep
	// accumulating in buffer.
	buffer   reportBuffer
	flushing reportBuffer

	// Flush state. flushingLock is held for the whole of Flush, so at most one
	// flush runs at a time. reportInFlight is set by preFlush and cleared by
	// postFlush; lastReportAttempt is the time recorded by the most recent
	// preFlush that started a flush.
	flushingLock      sync.Mutex
	reportInFlight    bool
	lastReportAttempt time.Time

	// Meta Event Reporting can be enabled at tracer creation or on-demand by satellite.
	// Initialized from the options and then updated by Flush from the
	// collector response; Flush writes it while holding flushingLock, not
	// `lock`.
	metaEventReportingEnabled bool
	// Set to true by Flush once it has emitted the tracer-create meta event,
	// which only happens when meta event reporting is enabled in the options.
	// Written while holding flushingLock, not `lock`.
	firstReportHasRun bool

	// We allow our remote peer to disable this instrumentation at any
	// time, turning all potentially costly runtime operations into
	// no-ops.
	//
	// TODO this should use atomic load/store to test disabled
	// prior to taking the lock, do please.
	disabled bool

	// Map of propagators used to determine the correct propagator to use
	// based on the format passed into Inject/Extract. Supports one
	// propagator for each of the formats: TextMap, HTTPHeaders, Binary
	propagators map[opentracing.BuiltinFormat]Propagator
}

// NewTracer builds and starts a Lightstep Tracer from opts.
//
// Unlike CreateTracer it does not return the failure to the caller: when
// construction fails, a start-error event is emitted and nil is returned.
func NewTracer(opts Options) Tracer {
	created, startErr := CreateTracer(opts)
	if startErr == nil {
		return created
	}

	emitEvent(newEventStartError(startErr))
	return nil
}

// CreateTracer builds and starts a Lightstep Tracer from opts, returning the
// error instead of swallowing it. It is the error-propagating replacement for
// NewTracer.
//
// It initializes the options, creates the collector client, opens the first
// connection and starts the background report loop. An error is returned when
// option initialization, client creation or the initial connection fails.
func CreateTracer(opts Options) (Tracer, error) {
	if initErr := opts.Initialize(); initErr != nil {
		return nil, fmt.Errorf("init; err: %v", initErr)
	}

	// Stringify the user tags first and write the tracer-owned keys last, so
	// that a user-supplied tag can never override them.
	attrs := make(map[string]string, len(opts.Tags)+3)
	for key, value := range opts.Tags {
		attrs[key] = fmt.Sprint(value)
	}
	attrs[TracerPlatformKey] = TracerPlatformValue
	attrs[TracerPlatformVersionKey] = runtime.Version()
	attrs[TracerVersionKey] = TracerVersionValue

	startedAt := time.Now()
	impl := &tracerImpl{
		opts:                      opts,
		reporterID:                genSeededGUID(),
		buffer:                    newSpansBuffer(opts.MaxBufferedSpans),
		flushing:                  newSpansBuffer(opts.MaxBufferedSpans),
		closeReportLoopChannel:    make(chan struct{}),
		reportLoopClosedChannel:   make(chan struct{}),
		converter:                 newProtoConverter(opts),
		accessToken:               opts.AccessToken,
		attributes:                attrs,
		metaEventReportingEnabled: opts.MetaEventReportingEnabled,
		// firstReportHasRun keeps its zero value (false) until the first Flush.
	}
	impl.buffer.setCurrent(startedAt)

	client, err := newCollectorClient(opts)
	if err != nil {
		return nil, fmt.Errorf("create collector client; err: %v", err)
	}
	impl.client = client

	conn, err := client.ConnectClient()
	if err != nil {
		return nil, err
	}
	impl.connection = conn

	go impl.reportLoop()

	// TextMap and HTTPHeaders share one propagator: the LightStep one unless
	// the options ask for "b3". Binary always uses the binary propagator.
	var textPropagator Propagator = theLightStepPropagator
	if opts.Propagator == "b3" {
		textPropagator = theB3Propagator
	}
	impl.propagators = map[opentracing.BuiltinFormat]Propagator{
		opentracing.TextMap:     textPropagator,
		opentracing.HTTPHeaders: textPropagator,
		opentracing.Binary:      theBinaryPropagator,
	}

	return impl, nil
}

// Options returns a copy of the options this tracer was created with.
func (tracer *tracerImpl) Options() Options {
	current := tracer.opts
	return current
}

// StartSpan creates a span named operationName that belongs to this tracer,
// applying the given start options. The work is delegated to newSpan.
func (tracer *tracerImpl) StartSpan(operationName string, sso ...opentracing.StartSpanOption) opentracing.Span {
	span := newSpan(operationName, tracer, sso)
	return span
}

// Inject writes the span context `sc` into `carrier` using the propagator
// registered for `format`.
//
// When meta event reporting is enabled in the tracer options, a meta span
// describing the injection is started and finished on the global OpenTracing
// tracer first.
//
// It returns opentracing.ErrUnsupportedFormat when `format` is not an
// opentracing.BuiltinFormat or when no propagator is registered for it;
// otherwise it returns whatever the selected propagator returns.
func (tracer *tracerImpl) Inject(sc opentracing.SpanContext, format interface{}, carrier interface{}) error {
	if tracer.opts.MetaEventReportingEnabled {
		// Only LightStep span contexts carry the IDs reported by the meta
		// event. The checked assertion keeps a foreign (or nil) span context
		// from panicking here; validating `sc` is left to the propagator.
		if lsContext, ok := sc.(SpanContext); ok {
			opentracing.StartSpan(LSMetaEvent_InjectOperation,
				opentracing.Tag{Key: LSMetaEvent_MetaEventKey, Value: true},
				opentracing.Tag{Key: LSMetaEvent_TraceIdKey, Value: lsContext.TraceID},
				opentracing.Tag{Key: LSMetaEvent_SpanIdKey, Value: lsContext.SpanID},
				opentracing.Tag{Key: LSMetaEvent_PropagationFormatKey, Value: format}).
				Finish()
		}
	}

	builtin, ok := format.(opentracing.BuiltinFormat)
	if !ok {
		return opentracing.ErrUnsupportedFormat
	}
	// A BuiltinFormat without a registered propagator would yield a nil map
	// value; report it as unsupported instead of calling a method on nil.
	propagator, ok := tracer.propagators[builtin]
	if !ok {
		return opentracing.ErrUnsupportedFormat
	}
	return propagator.Inject(sc, carrier)
}

// Extract reads a span context out of `carrier` using the propagator
// registered for `format`.
//
// When meta event reporting is enabled in the tracer options, a meta span
// describing the extraction is started and finished on the global OpenTracing
// tracer first.
//
// It returns opentracing.ErrUnsupportedFormat when `format` is not an
// opentracing.BuiltinFormat or when no propagator is registered for it;
// otherwise it returns whatever the selected propagator returns.
func (tracer *tracerImpl) Extract(format interface{}, carrier interface{}) (opentracing.SpanContext, error) {
	if tracer.opts.MetaEventReportingEnabled {
		opentracing.StartSpan(LSMetaEvent_ExtractOperation,
			opentracing.Tag{Key: LSMetaEvent_MetaEventKey, Value: true},
			opentracing.Tag{Key: LSMetaEvent_PropagationFormatKey, Value: format}).
			Finish()
	}

	builtin, ok := format.(opentracing.BuiltinFormat)
	if !ok {
		return nil, opentracing.ErrUnsupportedFormat
	}
	// A BuiltinFormat without a registered propagator would yield a nil map
	// value; report it as unsupported instead of calling a method on nil.
	propagator, ok := tracer.propagators[builtin]
	if !ok {
		return nil, opentracing.ErrUnsupportedFormat
	}
	return propagator.Extract(carrier)
}

// reconnectClient opens a fresh collector connection and swaps it in for the
// current one, then closes the connection it replaced.
//
// A failure to connect is reported as a connection-error event and leaves the
// existing connection in place. A failure to close the replaced connection is
// reported the same way, matching what Close does. The `now` parameter is not
// used by the body.
func (tracer *tracerImpl) reconnectClient(now time.Time) {
	conn, err := tracer.client.ConnectClient()
	if err != nil {
		emitEvent(newEventConnectionError(err))
		return
	}

	// Swap under the lock; close the old connection outside of it.
	tracer.lock.Lock()
	oldConn := tracer.connection
	tracer.connection = conn
	tracer.lock.Unlock()

	// Close sets the connection to nil, so there may be nothing to release.
	if oldConn == nil {
		return
	}
	if err := oldConn.Close(); err != nil {
		emitEvent(newEventConnectionError(err))
	}
}

// Close stops the report loop, flushes the remaining buffered data and then
// closes the collector connection. Only the first call does any work; later
// calls are no-ops.
//
// If ctx ends before the report loop acknowledges the stop request, Close
// returns without flushing and without closing the connection.
func (tracer *tracerImpl) Close(ctx context.Context) {
	shutdown := func() {
		// Ask the report loop to stop.
		close(tracer.closeReportLoopChannel)

		// Wait for the acknowledgement, or give up when ctx ends.
		select {
		case <-ctx.Done():
			return
		case <-tracer.reportLoopClosedChannel:
		}

		// The report loop has stopped; send whatever is still buffered.
		tracer.Flush(ctx)

		// Detach the connection under the lock, then close it outside of it.
		tracer.lock.Lock()
		detached := tracer.connection
		tracer.connection = nil
		tracer.lock.Unlock()

		if detached == nil {
			return
		}
		if closeErr := detached.Close(); closeErr != nil {
			emitEvent(newEventConnectionError(closeErr))
		}
	}

	tracer.closeOnce.Do(shutdown)
}

// RecordSpan stores a finished span in the tracer's buffer and, when the
// options carry a Recorder, hands the span to it as well. Nothing happens if
// the tracer has been disabled.
func (tracer *tracerImpl) RecordSpan(raw RawSpan) {
	tracer.lock.Lock()
	enabled := !tracer.disabled
	if enabled {
		tracer.buffer.addSpan(raw)
	}
	tracer.lock.Unlock()

	// Disabled runtimes stop here; the external recorder is not called.
	if !enabled {
		return
	}

	if recorder := tracer.opts.Recorder; recorder != nil {
		recorder.RecordSpan(raw)
	}
}

// Flush sends all buffered data to the collector.
//
// Flushes are serialized by flushingLock. preFlush swaps the buffers and may
// refuse the flush (tracer disabled or closed), in which case the refusal is
// emitted as an event and nothing is sent. Otherwise the flushing buffer is
// converted to a report request, translated by the client and sent with a
// deadline of opts.ReportTimeout on top of ctx. Errors are emitted as flush
// error events, and a status report event is emitted after every attempt.
//
// After a report that returned no transport error, the collector response
// decides whether meta event reporting stays on (DevMode) and whether the
// tracer must disable itself (Disable).
func (tracer *tracerImpl) Flush(ctx context.Context) {
	tracer.flushingLock.Lock()
	defer tracer.flushingLock.Unlock()

	if errorEvent := tracer.preFlush(); errorEvent != nil {
		emitEvent(errorEvent)
		return
	}

	// Announce the tracer once, on the first flush, when meta events are on.
	if tracer.opts.MetaEventReportingEnabled && !tracer.firstReportHasRun {
		opentracing.StartSpan(LSMetaEvent_TracerCreateOperation,
			opentracing.Tag{Key: LSMetaEvent_MetaEventKey, Value: true},
			opentracing.Tag{Key: LSMetaEvent_TracerGuidKey, Value: tracer.reporterID}).
			Finish()
		tracer.firstReportHasRun = true
	}

	ctx, cancel := context.WithTimeout(ctx, tracer.opts.ReportTimeout)
	defer cancel()

	protoReq := tracer.converter.toReportRequest(
		tracer.reporterID,
		tracer.attributes,
		tracer.accessToken,
		&tracer.flushing,
	)
	req, err := tracer.client.Translate(protoReq)
	if err != nil {
		errorEvent := newEventFlushError(err, FlushErrorTranslate)
		emitEvent(errorEvent)
		// call postflush to prevent the tracer from going into an invalid state.
		emitEvent(tracer.postFlush(errorEvent))
		return
	}

	var reportErrorEvent *eventFlushError
	resp, err := tracer.client.Report(ctx, req)
	if err != nil {
		reportErrorEvent = newEventFlushError(err, FlushErrorTransport)
	} else if reportErrs := resp.GetErrors(); len(reportErrs) > 0 {
		// The message comes from the collector; pass it as an argument, never
		// as the format string, so a '%' in it is reproduced verbatim.
		reportErrorEvent = newEventFlushError(fmt.Errorf("%s", reportErrs[0]), FlushErrorReport)
	}

	if reportErrorEvent != nil {
		emitEvent(reportErrorEvent)
	}
	emitEvent(tracer.postFlush(reportErrorEvent))

	// Without a usable response there are no collector instructions to apply.
	if err != nil {
		return
	}

	tracer.metaEventReportingEnabled = resp.DevMode()
	if resp.Disable() {
		tracer.Disable()
	}
}

// preFlush does the lock-protected bookkeeping that has to happen before a
// flush. It returns a flush error event, and changes nothing, when the tracer
// is disabled or has no connection. Otherwise it swaps the two buffers, marks
// a report as in flight, stamps both buffers and the last-attempt time with
// the current time, and returns nil.
func (tracer *tracerImpl) preFlush() *eventFlushError {
	tracer.lock.Lock()
	defer tracer.lock.Unlock()

	switch {
	case tracer.disabled:
		return newEventFlushError(errFlushFailedTracerClosed, FlushErrorTracerDisabled)
	case tracer.connection == nil:
		return newEventFlushError(errFlushFailedTracerClosed, FlushErrorTracerClosed)
	}

	startedAt := time.Now()

	// What was being filled is now what gets flushed, and vice versa.
	filled := tracer.buffer
	tracer.buffer = tracer.flushing
	tracer.flushing = filled

	tracer.reportInFlight = true
	tracer.flushing.setFlushing(startedAt)
	tracer.buffer.setCurrent(startedAt)
	tracer.lastReportAttempt = startedAt
	return nil
}

// postFlush does the lock-protected bookkeeping that has to happen after a
// flush attempt and returns the status report event describing it.
//
// flushEventError is nil after a successful flush; the flushing buffer is then
// cleared. After a translation error the flushing buffer is cleared as well,
// because the data is not retried. After any other error the unsent records
// are merged back into the current buffer. On every error the report's
// sent-span count is set to 0.
func (tracer *tracerImpl) postFlush(flushEventError *eventFlushError) *eventStatusReport {
	tracer.lock.Lock()
	defer tracer.lock.Unlock()

	tracer.reportInFlight = false

	// Snapshot the counters before the flushing buffer is cleared or merged.
	report := newEventStatusReport(
		tracer.flushing.reportStart,
		tracer.flushing.reportEnd,
		len(tracer.flushing.rawSpans),
		int(tracer.flushing.droppedSpanCount+tracer.buffer.droppedSpanCount),
		int(tracer.flushing.logEncoderErrorCount+tracer.buffer.logEncoderErrorCount),
	)

	if flushEventError == nil {
		tracer.flushing.clear()
		return report
	}

	if flushEventError.State() == FlushErrorTranslate {
		// A translation error would fail again, so the data is dropped.
		tracer.flushing.clear()
	} else {
		// Keep the records that did not get sent for the next attempt.
		tracer.buffer.mergeFrom(&tracer.flushing)
	}

	report.SetSentSpans(0)
	return report
}

// Disable marks the tracer as disabled and clears the current span buffer.
// The first call emits a tracer-disabled event; calls made once the tracer is
// already disabled do nothing.
func (tracer *tracerImpl) Disable() {
	tracer.lock.Lock()
	alreadyDisabled := tracer.disabled
	if !alreadyDisabled {
		tracer.disabled = true
		tracer.buffer.clear()
	}
	tracer.lock.Unlock()

	// The event is emitted outside the lock, and only on the transition.
	if alreadyDisabled {
		return
	}
	emitEvent(newEventTracerDisabled())
}

// Every MinReportingPeriod the reporting loop wakes up and checks to see if
// either (a) the tracer's ReportingPeriod is about to expire, i.e. it would be
// exceeded before the next wake-up, or (b) the span buffer reports that it is
// half full. If either of those conditions is true, pending data is flushed to
// the remote peer. If not, the reporting loop waits until the next cycle. See
// shouldFlushLocked() for details.
//
// This could alternatively be implemented using flush channels and so forth,
// but that would introduce opportunities for client code to block on the
// runtime library, and we want to avoid that at all costs (even dropping data,
// which can certainly happen with high data rates and/or unresponsive remote
// peers).

// shouldFlushLocked reports whether the report loop should flush at time now.
// It is called by reportLoop with tracer.lock held.
//
// A flush is due when waiting one more MinReportingPeriod would put the time
// since the last report attempt beyond ReportingPeriod, or when the current
// buffer reports that it is half full.
func (tracer *tracerImpl) shouldFlushLocked(now time.Time) bool {
	nextWakeup := now.Add(tracer.opts.MinReportingPeriod)
	periodExpiring := nextWakeup.Sub(tracer.lastReportAttempt) > tracer.opts.ReportingPeriod
	return periodExpiring || tracer.buffer.isHalfFull()
}

// reportLoop is the background goroutine started by CreateTracer. On every
// tick of MinReportingPeriod it flushes when shouldFlushLocked says so and
// reconnects the client when the client asks for it and no report is in
// flight. It exits when the tracer is found disabled on a tick or when Close
// signals closeReportLoopChannel.
//
// reportLoopClosedChannel is closed on every exit path, so a Close issued
// after the loop has stopped because the tracer was disabled does not wait
// forever for an acknowledgement.
func (tracer *tracerImpl) reportLoop() {
	defer close(tracer.reportLoopClosedChannel)

	// time.NewTicker panics on a non-positive period. In that case keep a nil
	// channel, which never fires, so only the close signal ends the loop.
	var tickerChan <-chan time.Time
	if period := tracer.opts.MinReportingPeriod; period > 0 {
		ticker := time.NewTicker(period)
		// Release the ticker when the loop exits.
		defer ticker.Stop()
		tickerChan = ticker.C
	}

	for {
		select {
		case <-tickerChan:
			now := time.Now()

			// Take one consistent snapshot of the state under the lock, then
			// act on it without holding the lock.
			tracer.lock.Lock()
			disabled := tracer.disabled
			reconnect := !tracer.reportInFlight && tracer.client.ShouldReconnect()
			shouldFlush := tracer.shouldFlushLocked(now)
			tracer.lock.Unlock()

			if disabled {
				return
			}
			if shouldFlush {
				tracer.Flush(context.Background())
			}
			if reconnect {
				tracer.reconnectClient(now)
			}
		case <-tracer.closeReportLoopChannel:
			return
		}
	}
}