// Copyright 2021 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package vta

import (
	"fmt"
	"go/token"
	"go/types"

	"golang.org/x/tools/go/callgraph"
	"golang.org/x/tools/go/ssa"
	"golang.org/x/tools/go/types/typeutil"
)

// node interface for VTA nodes.
type node interface {
	Type() types.Type
	String() string
}

// constant node for VTA.
type constant struct {
	typ types.Type
}

func (c constant) Type() types.Type {
	return c.typ
}

func (c constant) String() string {
	return fmt.Sprintf("Constant(%v)", c.Type())
}

// pointer node for VTA.
type pointer struct {
	typ *types.Pointer
}

func (p pointer) Type() types.Type {
	return p.typ
}

func (p pointer) String() string {
	return fmt.Sprintf("Pointer(%v)", p.Type())
}

// mapKey node for VTA, modeling reachable map key types.
type mapKey struct {
	typ types.Type
}

func (mk mapKey) Type() types.Type {
	return mk.typ
}

func (mk mapKey) String() string {
	return fmt.Sprintf("MapKey(%v)", mk.Type())
}

// mapValue node for VTA, modeling reachable map value types.
type mapValue struct {
	typ types.Type
}

func (mv mapValue) Type() types.Type {
	return mv.typ
}

func (mv mapValue) String() string {
	return fmt.Sprintf("MapValue(%v)", mv.Type())
}

// sliceElem node for VTA, modeling reachable slice element types.
type sliceElem struct {
	typ types.Type
}

func (s sliceElem) Type() types.Type {
	return s.typ
}

func (s sliceElem) String() string {
	return fmt.Sprintf("Slice([]%v)", s.Type())
}

// channelElem node for VTA, modeling reachable channel element types.
type channelElem struct {
	typ types.Type
}

func (c channelElem) Type() types.Type {
	return c.typ
}

func (c channelElem) String() string {
	return fmt.Sprintf("Channel(chan %v)", c.Type())
}

// field node for VTA.
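// For example (an illustrative sketch), given
//   type T struct{ x I }
// the node field{StructType: T, index: 0} models the field x, and its
// Type() is I.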
type field struct {
	StructType types.Type
	index      int // index of the field in the struct
}

func (f field) Type() types.Type {
	s := f.StructType.Underlying().(*types.Struct)
	return s.Field(f.index).Type()
}

func (f field) String() string {
	s := f.StructType.Underlying().(*types.Struct)
	return fmt.Sprintf("Field(%v:%s)", f.StructType, s.Field(f.index).Name())
}

// global node for VTA.
type global struct {
	val *ssa.Global
}

func (g global) Type() types.Type {
	return g.val.Type()
}

func (g global) String() string {
	return fmt.Sprintf("Global(%s)", g.val.Name())
}

// local node for VTA modeling local variables
// and function/method parameters.
type local struct {
	val ssa.Value
}

func (l local) Type() types.Type {
	return l.val.Type()
}

func (l local) String() string {
	return fmt.Sprintf("Local(%s)", l.val.Name())
}

// indexedLocal node for VTA. Models indexed locals
// related to the ssa extract instructions.
type indexedLocal struct {
	val   ssa.Value
	index int
	typ   types.Type
}

func (i indexedLocal) Type() types.Type {
	return i.typ
}

func (i indexedLocal) String() string {
	return fmt.Sprintf("Local(%s[%d])", i.val.Name(), i.index)
}

// function node for VTA.
type function struct {
	f *ssa.Function
}

func (f function) Type() types.Type {
	return f.f.Type()
}

func (f function) String() string {
	return fmt.Sprintf("Function(%s)", f.f.Name())
}

// nestedPtrInterface node represents all references and dereferences
// of locals and globals that have a nested pointer to interface type.
// We merge such constructs into a single node for simplicity and without
// much loss of precision, as such variables are rare in practice. Both
// a and b would be represented as the same PtrInterface(I) node in:
//   type I interface{}
//   var a ***I
//   var b **I
type nestedPtrInterface struct {
	typ types.Type
}

func (l nestedPtrInterface) Type() types.Type {
	return l.typ
}

func (l nestedPtrInterface) String() string {
	return fmt.Sprintf("PtrInterface(%v)", l.typ)
}

// panicArg models types of all arguments passed to panic.
type panicArg struct{}

func (p panicArg) Type() types.Type {
	return nil
}

func (p panicArg) String() string {
	return "Panic"
}

// recoverReturn models types of all return values of recover().
type recoverReturn struct{}

func (r recoverReturn) Type() types.Type {
	return nil
}

func (r recoverReturn) String() string {
	return "Recover"
}

// vtaGraph remembers for each VTA node the set of its successors.
// Tailored for VTA, hence does not support singleton (sub)graphs.
type vtaGraph map[node]map[node]bool

// addEdge adds an edge x->y to the graph.
func (g vtaGraph) addEdge(x, y node) {
	succs, ok := g[x]
	if !ok {
		succs = make(map[node]bool)
		g[x] = succs
	}
	succs[y] = true
}

// successors returns all of n's immediate successors in the graph.
// The order of successor nodes is arbitrary.
func (g vtaGraph) successors(n node) []node {
	var succs []node
	for succ := range g[n] {
		succs = append(succs, succ)
	}
	return succs
}

// typePropGraph builds a VTA graph for a set of `funcs` and initial
// `callgraph` needed to establish interprocedural edges. Returns the
// graph and a map for unique type representatives.
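//
// A typical use (an illustrative sketch, where prog is an *ssa.Program;
// any initial call graph works, e.g., one computed by the cha package):
//   funcs := ssautil.AllFunctions(prog)
//   graph, canon := typePropGraph(funcs, cha.CallGraph(prog))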
func typePropGraph(funcs map[*ssa.Function]bool, callgraph *callgraph.Graph) (vtaGraph, *typeutil.Map) {
	b := builder{graph: make(vtaGraph), callGraph: callgraph}
	b.visit(funcs)
	return b.graph, &b.canon
}

// builder is a data structure responsible for linearly traversing the
// code and building a VTA graph.
type builder struct {
	graph     vtaGraph
	callGraph *callgraph.Graph // initial call graph for creating flows at unresolved call sites.

	// Specialized type map for canonicalization of types.Type.
	// Semantically equivalent types can have different implementations,
	// i.e., they are different pointer values. The map allows us to
	// have one unique representative. The keys are fixed and from the
	// client perspective they are types. The values in our case are
	// types too, in particular type representatives. Each value is a
	// pointer so this map is not expected to take much memory.
	canon typeutil.Map
}

func (b *builder) visit(funcs map[*ssa.Function]bool) {
	// Add the fixed edge Panic -> Recover
	b.graph.addEdge(panicArg{}, recoverReturn{})

	for f, in := range funcs {
		if in {
			b.fun(f)
		}
	}
}

func (b *builder) fun(f *ssa.Function) {
	for _, bl := range f.Blocks {
		for _, instr := range bl.Instrs {
			b.instr(instr)
		}
	}
}

func (b *builder) instr(instr ssa.Instruction) {
	switch i := instr.(type) {
	case *ssa.Store:
		b.addInFlowAliasEdges(b.nodeFromVal(i.Addr), b.nodeFromVal(i.Val))
	case *ssa.MakeInterface:
		b.addInFlowEdge(b.nodeFromVal(i.X), b.nodeFromVal(i))
	case *ssa.MakeClosure:
		b.closure(i)
	case *ssa.UnOp:
		b.unop(i)
	case *ssa.Phi:
		b.phi(i)
	case *ssa.ChangeInterface:
		// Although in a ChangeInterface command a := A(b) the values
		// a and b are the same object, the only interesting flow
		// happens when A is an interface. We create the flow b -> a,
		// but omit a -> b. The latter flow is not needed: if a gets
		// assigned a concrete type later on, that cannot be propagated
		// back to b, as b is a separate variable. The a -> b flow can
		// happen when A is a pointer to interface, but then the command
		// is of type ChangeType, handled below.
		b.addInFlowEdge(b.nodeFromVal(i.X), b.nodeFromVal(i))
	case *ssa.ChangeType:
		// A ChangeType command a := A(b) results in a and b being the
		// same value. For a concrete type A, there is no interesting flow.
		//
		// Note: When A is an interface, most interface casts are handled
		// by the ChangeInterface instruction. The relevant case here is
		// when converting a pointer to an interface type. This can happen
		// when the underlying interfaces have the same method set.
		//   type I interface{ foo() }
		//   type J interface{ foo() }
		//   var b *I
		//   a := (*J)(b)
		// When this happens we add flows between a <--> b.
		b.addInFlowAliasEdges(b.nodeFromVal(i), b.nodeFromVal(i.X))
	case *ssa.TypeAssert:
		b.tassert(i)
	case *ssa.Extract:
		b.extract(i)
	case *ssa.Field:
		b.field(i)
	case *ssa.FieldAddr:
		b.fieldAddr(i)
	case *ssa.Send:
		b.send(i)
	case *ssa.Select:
		b.selekt(i)
	case *ssa.Index:
		b.index(i)
	case *ssa.IndexAddr:
		b.indexAddr(i)
	case *ssa.Lookup:
		b.lookup(i)
	case *ssa.MapUpdate:
		b.mapUpdate(i)
	case *ssa.Next:
		b.next(i)
	case ssa.CallInstruction:
		b.call(i)
	case *ssa.Panic:
		b.panic(i)
	case *ssa.Return:
		b.rtrn(i)
	case *ssa.MakeChan, *ssa.MakeMap, *ssa.MakeSlice, *ssa.BinOp,
		*ssa.Alloc, *ssa.DebugRef, *ssa.Convert, *ssa.Jump, *ssa.If,
		*ssa.Slice, *ssa.Range, *ssa.RunDefers:
		// No interesting flow here.
		return
	default:
		panic(fmt.Sprintf("unsupported instruction %v\n", instr))
	}
}

func (b *builder) unop(u *ssa.UnOp) {
	switch u.Op {
	case token.MUL:
		// Multiplication operator * is used here as a dereference operator.
		b.addInFlowAliasEdges(b.nodeFromVal(u), b.nodeFromVal(u.X))
	case token.ARROW:
		t := u.X.Type().Underlying().(*types.Chan).Elem()
		b.addInFlowAliasEdges(b.nodeFromVal(u), channelElem{typ: t})
	default:
		// There is no interesting type flow otherwise.
	}
}

func (b *builder) phi(p *ssa.Phi) {
	for _, edge := range p.Edges {
		b.addInFlowAliasEdges(b.nodeFromVal(p), b.nodeFromVal(edge))
	}
}

func (b *builder) tassert(a *ssa.TypeAssert) {
	if !a.CommaOk {
		b.addInFlowEdge(b.nodeFromVal(a.X), b.nodeFromVal(a))
		return
	}
	// Here a is a tuple register <a.AssertedType, bool>, so there is
	// a flow from a.X to a[0]. a[0] is represented as an indexedLocal:
	// an entry into the local tuple register a at index 0.
	tup := a.Type().Underlying().(*types.Tuple)
	t := tup.At(0).Type()

	local := indexedLocal{val: a, typ: t, index: 0}
	b.addInFlowEdge(b.nodeFromVal(a.X), local)
}

// extract instruction t1 := t2[i] generates flows between t2[i]
// and t1 where the source is an indexed local representing the value
// of tuple register t2 at index i and the target is t1.
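// For example (an illustrative sketch), for
//   t1 = extract t2 #1
// flows are created between the indexed local t2[1] and t1.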
func (b *builder) extract(e *ssa.Extract) {
	tup := e.Tuple.Type().Underlying().(*types.Tuple)
	t := tup.At(e.Index).Type()

	local := indexedLocal{val: e.Tuple, typ: t, index: e.Index}
	b.addInFlowAliasEdges(b.nodeFromVal(e), local)
}

func (b *builder) field(f *ssa.Field) {
	fnode := field{StructType: f.X.Type(), index: f.Field}
	b.addInFlowEdge(fnode, b.nodeFromVal(f))
}

func (b *builder) fieldAddr(f *ssa.FieldAddr) {
	t := f.X.Type().Underlying().(*types.Pointer).Elem()

	// Since we are taking the address of a field, make a bidirectional edge.
	fnode := field{StructType: t, index: f.Field}
	b.addInFlowEdge(fnode, b.nodeFromVal(f))
	b.addInFlowEdge(b.nodeFromVal(f), fnode)
}

func (b *builder) send(s *ssa.Send) {
	t := s.Chan.Type().Underlying().(*types.Chan).Elem()
	b.addInFlowAliasEdges(channelElem{typ: t}, b.nodeFromVal(s.X))
}

// selekt generates flows for a select statement
//   a = select blocking/nonblocking [c_1 <- t_1, c_2 <- t_2, ..., <- o_1, <- o_2, ...]
// between each send channel register c_i and its corresponding input register t_i.
// Further, flows are generated between each receive channel o_i and a[2 + i]. Note
// that a is a tuple register of type <int, bool, r_1, r_2, ...> where the type of
// r_i is the element type of channel o_i.
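//
// For example (an illustrative sketch), for
//   a = select nonblocking [c1 <- t1, <-c2]
// flows are created between the element type of c1 and t1 (the send case)
// and between a[2] and the element type of c2 (the receive case).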
func (b *builder) selekt(s *ssa.Select) {
	recvIndex := 0
	for _, state := range s.States {
		t := state.Chan.Type().Underlying().(*types.Chan).Elem()

		if state.Dir == types.SendOnly {
			b.addInFlowAliasEdges(channelElem{typ: t}, b.nodeFromVal(state.Send))
		} else {
			// state.Dir == RecvOnly by definition of select instructions.
			tupEntry := indexedLocal{val: s, typ: t, index: 2 + recvIndex}
			b.addInFlowAliasEdges(tupEntry, channelElem{typ: t})
			recvIndex++
		}
	}
}

// index instruction a := b[c] on slices creates flows between a and
// SliceElem(t) where t is the element type of slice (or array) b.
// Array and slice elements are both modeled as SliceElem.
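//
// For example (an illustrative sketch), for
//   t1 = t2[t3]
// where t2 has type []I, flows are created between t1 and SliceElem(I).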
func (b *builder) index(i *ssa.Index) {
	et := sliceArrayElem(i.X.Type())
	b.addInFlowAliasEdges(b.nodeFromVal(i), sliceElem{typ: et})
}

// indexAddr instruction a := &b[c] fetches the address of the element
// b[c], so we create a bidirectional flow a <-> SliceElem(t) where t is
// the element type of slice (or array) b. Array and slice elements are
// both modeled as SliceElem.
func (b *builder) indexAddr(i *ssa.IndexAddr) {
	et := sliceArrayElem(i.X.Type())
	b.addInFlowEdge(sliceElem{typ: et}, b.nodeFromVal(i))
	b.addInFlowEdge(b.nodeFromVal(i), sliceElem{typ: et})
}

// lookup handles map query commands a := m[b] where m is of type
// map[...]V and V is an interface. It creates flows between `a`
// and MapValue(V).
func (b *builder) lookup(l *ssa.Lookup) {
	t, ok := l.X.Type().Underlying().(*types.Map)
	if !ok {
		// No interesting flows for string lookups.
		return
	}
	b.addInFlowAliasEdges(b.nodeFromVal(l), mapValue{typ: t.Elem()})
}

// mapUpdate handles map update commands m[b] = a where m is of type
// map[K]V and K and V are interfaces. It creates flows between `a`
// and MapValue(V) as well as between MapKey(K) and `b`.
func (b *builder) mapUpdate(u *ssa.MapUpdate) {
	t, ok := u.Map.Type().Underlying().(*types.Map)
	if !ok {
		// No interesting flows for string updates.
		return
	}

	b.addInFlowAliasEdges(mapKey{typ: t.Key()}, b.nodeFromVal(u.Key))
	b.addInFlowAliasEdges(mapValue{typ: t.Elem()}, b.nodeFromVal(u.Value))
}

// next instruction <ok, key, value> := next r, where r is a range over
// a map or a string, generates flows between key and MapKey as well as
// between value and MapValue nodes.
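//
// For example (an illustrative sketch), for
//   t1 = next t2
// where t2 ranges over a map[K]V, flows are created between t1[1] and
// MapKey(K) and between t1[2] and MapValue(V).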
func (b *builder) next(n *ssa.Next) {
	if n.IsString {
		return
	}
	tup := n.Type().Underlying().(*types.Tuple)
	kt := tup.At(1).Type()
	vt := tup.At(2).Type()

	b.addInFlowAliasEdges(indexedLocal{val: n, typ: kt, index: 1}, mapKey{typ: kt})
	b.addInFlowAliasEdges(indexedLocal{val: n, typ: vt, index: 2}, mapValue{typ: vt})
}

// addInFlowAliasEdges adds an edge r -> l to b.graph if l is a node that can
// have an inflow, i.e., a node that represents an interface or an unresolved
// function value. Similarly for the edge l -> r, with the additional condition
// that l and r can potentially alias.
func (b *builder) addInFlowAliasEdges(l, r node) {
	b.addInFlowEdge(r, l)

	if canAlias(l, r) {
		b.addInFlowEdge(l, r)
	}
}

func (b *builder) closure(c *ssa.MakeClosure) {
	f := c.Fn.(*ssa.Function)
	b.addInFlowEdge(function{f: f}, b.nodeFromVal(c))

	for i, fv := range f.FreeVars {
		b.addInFlowAliasEdges(b.nodeFromVal(fv), b.nodeFromVal(c.Bindings[i]))
	}
}

// panic creates a flow from the arguments of panic instructions to the
// return registers of all recover statements in the program. It introduces
// a global panic node Panic and
//  1) for every panic statement p: add p -> Panic
//  2) for every recover statement r: add Panic -> r (handled in call)
// TODO(zpavlinovic): improve precision by explicitly modeling how panic
// values flow from callees to callers and into deferred recover instructions.
func (b *builder) panic(p *ssa.Panic) {
	// Panics often have, for instance, strings as arguments which do
	// not create interesting flows.
	if !canHaveMethods(p.X.Type()) {
		return
	}

	b.addInFlowEdge(b.nodeFromVal(p.X), panicArg{})
}

// call adds flows between arguments/parameters and return values/registers
// for both static and dynamic calls, as well as go and defer calls.
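//
// For example (an illustrative sketch), for a dynamic method call
//   t2 = invoke t1.Foo(t0)
// and every callee f of the call site according to b.callGraph, flows are
// created between t1 and the receiver parameter f.Params[0] and between
// t0 and f.Params[1].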
func (b *builder) call(c ssa.CallInstruction) {
	// When c is a register instruction r := recover(), we add Recover -> r.
	if bf, ok := c.Common().Value.(*ssa.Builtin); ok && bf.Name() == "recover" {
		b.addInFlowEdge(recoverReturn{}, b.nodeFromVal(c.(*ssa.Call)))
		return
	}

	for _, f := range siteCallees(c, b.callGraph) {
		addArgumentFlows(b, c, f)
	}
}

func addArgumentFlows(b *builder, c ssa.CallInstruction, f *ssa.Function) {
	cc := c.Common()
	// When c is an unresolved method call (cc.Method != nil), cc.Value contains
	// the receiver object rather than cc.Args[0].
	if cc.Method != nil {
		b.addInFlowAliasEdges(b.nodeFromVal(f.Params[0]), b.nodeFromVal(cc.Value))
	}

	offset := 0
	if cc.Method != nil {
		offset = 1
	}
	for i, v := range cc.Args {
		b.addInFlowAliasEdges(b.nodeFromVal(f.Params[i+offset]), b.nodeFromVal(v))
	}
}

// rtrn produces flows between the return values of r and each call
// instruction c that resolves to the enclosing function of r based
// on b.callGraph.
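//
// For example (an illustrative sketch), if a call site
//   t0 = g()
// resolves to the enclosing function of
//   return t1, t2
// then flows are created from t1 to t0[0] and from t2 to t0[1].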
func (b *builder) rtrn(r *ssa.Return) {
	n := b.callGraph.Nodes[r.Parent()]
	// n != nil when b.callGraph is sound, but the client can
	// pass any callgraph, including an underapproximate one.
	if n == nil {
		return
	}

	for _, e := range n.In {
		if cv, ok := e.Site.(ssa.Value); ok {
			addReturnFlows(b, r, cv)
		}
	}
}

func addReturnFlows(b *builder, r *ssa.Return, site ssa.Value) {
	results := r.Results
	if len(results) == 1 {
		// When there is only one return value, the destination register does not
		// have a tuple type.
		b.addInFlowEdge(b.nodeFromVal(results[0]), b.nodeFromVal(site))
		return
	}

	tup := site.Type().Underlying().(*types.Tuple)
	for i, r := range results {
		local := indexedLocal{val: site, typ: tup.At(i).Type(), index: i}
		b.addInFlowEdge(b.nodeFromVal(r), local)
	}
}

// addInFlowEdge adds s -> d to b.graph if d is a node that can have an inflow,
// i.e., a node that represents an interface or an unresolved function value.
// Otherwise, there is no interesting type flow so the edge is omitted.
func (b *builder) addInFlowEdge(s, d node) {
	if hasInFlow(d) {
		b.graph.addEdge(b.representative(s), b.representative(d))
	}
}

// nodeFromVal creates a const, pointer, global, func, or local node based on register instructions.
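// For example (an illustrative sketch), an *ssa.Const becomes a constant
// node, an *ssa.Global a global node, and a register of type **I, where I
// is an interface, becomes a nestedPtrInterface node for I.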
func (b *builder) nodeFromVal(val ssa.Value) node {
	if p, ok := val.Type().(*types.Pointer); ok && !isInterface(p.Elem()) {
		// Nested pointers to interfaces are modeled as a special
		// nestedPtrInterface node.
		if i := interfaceUnderPtr(p.Elem()); i != nil {
			return nestedPtrInterface{typ: i}
		}
		return pointer{typ: p}
	}

	switch v := val.(type) {
	case *ssa.Const:
		return constant{typ: val.Type()}
	case *ssa.Global:
		return global{val: v}
	case *ssa.Function:
		return function{f: v}
	case *ssa.Parameter, *ssa.FreeVar, ssa.Instruction:
		// ssa.Parameter, ssa.FreeVar, and a specific set of "register" instructions,
		// satisfying the ssa.Value interface, can serve as local variables.
		return local{val: v}
	default:
		panic(fmt.Errorf("unsupported value %v in node creation", val))
	}
}

// representative returns a unique representative for node `n`. Since
// semantically equivalent types can have different implementations,
// this method guarantees the same implementation is always used.
func (b *builder) representative(n node) node {
	if !hasInitialTypes(n) {
		return n
	}
	t := canonicalize(n.Type(), &b.canon)

	switch i := n.(type) {
	case constant:
		return constant{typ: t}
	case pointer:
		return pointer{typ: t.(*types.Pointer)}
	case sliceElem:
		return sliceElem{typ: t}
	case mapKey:
		return mapKey{typ: t}
	case mapValue:
		return mapValue{typ: t}
	case channelElem:
		return channelElem{typ: t}
	case nestedPtrInterface:
		return nestedPtrInterface{typ: t}
	case field:
		return field{StructType: canonicalize(i.StructType, &b.canon), index: i.index}
	case indexedLocal:
		return indexedLocal{typ: t, val: i.val, index: i.index}
	case local, global, panicArg, recoverReturn, function:
		return n
	default:
		panic(fmt.Errorf("canonicalizing unrecognized node %v", n))
	}
}

// canonicalize returns the unique type representative of `t` with
// respect to type map `canon`.
func canonicalize(t types.Type, canon *typeutil.Map) types.Type {
	rep := canon.At(t)
	if rep != nil {
		return rep.(types.Type)
	}
	canon.Set(t, t)
	return t
}