1// Copyright 2016 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package pipeline
6
7import (
8	"bytes"
9	"fmt"
10	"go/ast"
11	"go/constant"
12	"go/format"
13	"go/token"
14	"go/types"
15	"path/filepath"
16	"strings"
17	"unicode"
18	"unicode/utf8"
19
20	fmtparser "golang.org/x/text/internal/format"
21	"golang.org/x/tools/go/callgraph"
22	"golang.org/x/tools/go/callgraph/cha"
23	"golang.org/x/tools/go/loader"
24	"golang.org/x/tools/go/ssa"
25	"golang.org/x/tools/go/ssa/ssautil"
26)
27
// debug enables the diagnostic trace printed by (*extracter).debug and the
// duplicate-format-argument warning emitted by visitFormats.
const debug = false
29
30// TODO:
31// - merge information into existing files
32// - handle different file formats (PO, XLIFF)
33// - handle features (gender, plural)
34// - message rewriting
35
36// - `msg:"etc"` tags
37
38// Extract extracts all strings form the package defined in Config.
39func Extract(c *Config) (*State, error) {
40	x, err := newExtracter(c)
41	if err != nil {
42		return nil, wrap(err, "")
43	}
44
45	x.seedEndpoints()
46	x.extractMessages()
47
48	return &State{
49		Config:  *c,
50		program: x.iprog,
51		Extracted: Messages{
52			Language: c.SourceLanguage,
53			Messages: x.messages,
54		},
55	}, nil
56}
57
// extracter holds the loaded program, its SSA form and call graph, and the
// data collected while searching for translatable messages.
type extracter struct {
	// conf is the loader configuration filled in by loadPackages.
	conf loader.Config
	// iprog is the program returned by loadPackages.
	iprog *loader.Program
	// prog is the SSA program created from iprog.
	prog *ssa.Program
	// callGraph is the CHA call graph computed over prog.
	callGraph *callgraph.Graph

	// Calls and other expressions to collect.

	// globals is keyed by the position of a global variable.
	globals map[token.Pos]*constData
	// funcs is keyed by the position of a call site.
	funcs map[token.Pos]*callData
	// messages accumulates the extracted messages in discovery order.
	messages []Message
}
69
70func newExtracter(c *Config) (x *extracter, err error) {
71	x = &extracter{
72		conf:    loader.Config{},
73		globals: map[token.Pos]*constData{},
74		funcs:   map[token.Pos]*callData{},
75	}
76
77	x.iprog, err = loadPackages(&x.conf, c.Packages)
78	if err != nil {
79		return nil, wrap(err, "")
80	}
81
82	x.prog = ssautil.CreateProgram(x.iprog, ssa.GlobalDebug|ssa.BareInits)
83	x.prog.Build()
84
85	x.callGraph = cha.CallGraph(x.prog)
86
87	return x, nil
88}
89
90func (x *extracter) globalData(pos token.Pos) *constData {
91	cd := x.globals[pos]
92	if cd == nil {
93		cd = &constData{}
94		x.globals[pos] = cd
95	}
96	return cd
97}
98
99func (x *extracter) seedEndpoints() {
100	pkg := x.prog.Package(x.iprog.Package("golang.org/x/text/message").Pkg)
101	typ := types.NewPointer(pkg.Type("Printer").Type())
102
103	x.processGlobalVars()
104
105	x.handleFunc(x.prog.LookupMethod(typ, pkg.Pkg, "Printf"), &callData{
106		formatPos: 1,
107		argPos:    2,
108		isMethod:  true,
109	})
110	x.handleFunc(x.prog.LookupMethod(typ, pkg.Pkg, "Sprintf"), &callData{
111		formatPos: 1,
112		argPos:    2,
113		isMethod:  true,
114	})
115	x.handleFunc(x.prog.LookupMethod(typ, pkg.Pkg, "Fprintf"), &callData{
116		formatPos: 2,
117		argPos:    3,
118		isMethod:  true,
119	})
120}
121
122// processGlobalVars finds string constants that are assigned to global
123// variables.
124func (x *extracter) processGlobalVars() {
125	for _, p := range x.prog.AllPackages() {
126		m, ok := p.Members["init"]
127		if !ok {
128			continue
129		}
130		for _, b := range m.(*ssa.Function).Blocks {
131			for _, i := range b.Instrs {
132				s, ok := i.(*ssa.Store)
133				if !ok {
134					continue
135				}
136				a, ok := s.Addr.(*ssa.Global)
137				if !ok {
138					continue
139				}
140				t := a.Type()
141				for {
142					p, ok := t.(*types.Pointer)
143					if !ok {
144						break
145					}
146					t = p.Elem()
147				}
148				if b, ok := t.(*types.Basic); !ok || b.Kind() != types.String {
149					continue
150				}
151				x.visitInit(a, s.Val)
152			}
153		}
154	}
155}
156
// constData collects what has been learned about one global variable: the
// call it is used as a format for, the constants stored into it, and the
// positions of other globals whose value was assigned to it.
type constData struct {
	call   *callData // to provide a signature for the constants
	values []constVal
	others []token.Pos // Assigned to other global data.
}
162
163func (d *constData) visit(x *extracter, f func(c constant.Value)) {
164	for _, v := range d.values {
165		f(v.value)
166	}
167	for _, p := range d.others {
168		if od, ok := x.globals[p]; ok {
169			od.visit(x, f)
170		}
171	}
172}
173
// constVal pairs a constant with the source position it was found at.
type constVal struct {
	value constant.Value
	pos   token.Pos
}
178
// callData describes either one of the seeded formatting methods or a call
// site that reaches such a method, directly or through wrapper functions.
type callData struct {
	call    ssa.CallInstruction // SSA call site, set by handleFunc
	expr    *ast.CallExpr       // matching AST call, set by extractMessages
	formats []constant.Value    // constant format strings found for this call

	callee    *callData // description of the function being called
	isMethod  bool      // parameter positions include the receiver
	formatPos int       // parameter position of the format string, -1 if unknown
	argPos    int       // varargs at this position in the call
	argTypes  []int     // arguments extractable from this position
}
190
191func (c *callData) callFormatPos() int {
192	c = c.callee
193	if c.isMethod {
194		return c.formatPos - 1
195	}
196	return c.formatPos
197}
198
199func (c *callData) callArgsStart() int {
200	c = c.callee
201	if c.isMethod {
202		return c.argPos - 1
203	}
204	return c.argPos
205}
206
207func (c *callData) Pos() token.Pos      { return c.call.Pos() }
208func (c *callData) Pkg() *types.Package { return c.call.Parent().Pkg.Pkg }
209
210func (x *extracter) handleFunc(f *ssa.Function, fd *callData) {
211	for _, e := range x.callGraph.Nodes[f].In {
212		if e.Pos() == 0 {
213			continue
214		}
215
216		call := e.Site
217		caller := x.funcs[call.Pos()]
218		if caller != nil {
219			// TODO: theoretically a format string could be passed to multiple
220			// arguments of a function. Support this eventually.
221			continue
222		}
223		x.debug(call, "CALL", f.String())
224
225		caller = &callData{
226			call:      call,
227			callee:    fd,
228			formatPos: -1,
229			argPos:    -1,
230		}
231		// Offset by one if we are invoking an interface method.
232		offset := 0
233		if call.Common().IsInvoke() {
234			offset = -1
235		}
236		x.funcs[call.Pos()] = caller
237		if fd.argPos >= 0 {
238			x.visitArgs(caller, call.Common().Args[fd.argPos+offset])
239		}
240		x.visitFormats(caller, call.Common().Args[fd.formatPos+offset])
241	}
242}
243
// posser is what debug needs from the SSA values and call instructions it
// is given: a source position and the enclosing function.
type posser interface {
	Pos() token.Pos
	Parent() *ssa.Function
}
248
249func (x *extracter) debug(v posser, header string, args ...interface{}) {
250	if debug {
251		pos := ""
252		if p := v.Parent(); p != nil {
253			pos = posString(&x.conf, p.Package().Pkg, v.Pos())
254		}
255		if header != "CALL" && header != "INSERT" {
256			header = "  " + header
257		}
258		fmt.Printf("%-32s%-10s%-15T ", pos+fmt.Sprintf("@%d", v.Pos()), header, v)
259		for _, a := range args {
260			fmt.Printf(" %v", a)
261		}
262		fmt.Println()
263	}
264}
265
266// visitInit evaluates and collects values assigned to global variables in an
267// init function.
268func (x *extracter) visitInit(global *ssa.Global, v ssa.Value) {
269	if v == nil {
270		return
271	}
272	x.debug(v, "GLOBAL", v)
273
274	switch v := v.(type) {
275	case *ssa.Phi:
276		for _, e := range v.Edges {
277			x.visitInit(global, e)
278		}
279
280	case *ssa.Const:
281		// Only record strings with letters.
282		if str := constant.StringVal(v.Value); isMsg(str) {
283			cd := x.globalData(global.Pos())
284			cd.values = append(cd.values, constVal{v.Value, v.Pos()})
285		}
286		// TODO: handle %m-directive.
287
288	case *ssa.Global:
289		cd := x.globalData(global.Pos())
290		cd.others = append(cd.others, v.Pos())
291
292	case *ssa.FieldAddr, *ssa.Field:
293		// TODO: mark field index v.Field of v.X.Type() for extraction. extract
294		// an example args as to give parameters for the translator.
295
296	case *ssa.Slice:
297		if v.Low == nil && v.High == nil && v.Max == nil {
298			x.visitInit(global, v.X)
299		}
300
301	case *ssa.Alloc:
302		if ref := v.Referrers(); ref == nil {
303			for _, r := range *ref {
304				values := []ssa.Value{}
305				for _, o := range r.Operands(nil) {
306					if o == nil || *o == v {
307						continue
308					}
309					values = append(values, *o)
310				}
311				// TODO: return something different if we care about multiple
312				// values as well.
313				if len(values) == 1 {
314					x.visitInit(global, values[0])
315				}
316			}
317		}
318
319	case ssa.Instruction:
320		rands := v.Operands(nil)
321		if len(rands) == 1 && rands[0] != nil {
322			x.visitInit(global, *rands[0])
323		}
324	}
325	return
326}
327
328// visitFormats finds the original source of the value. The returned index is
329// position of the argument if originated from a function argument or -1
330// otherwise.
331func (x *extracter) visitFormats(call *callData, v ssa.Value) {
332	if v == nil {
333		return
334	}
335	x.debug(v, "VALUE", v)
336
337	switch v := v.(type) {
338	case *ssa.Phi:
339		for _, e := range v.Edges {
340			x.visitFormats(call, e)
341		}
342
343	case *ssa.Const:
344		// Only record strings with letters.
345		if isMsg(constant.StringVal(v.Value)) {
346			x.debug(call.call, "FORMAT", v.Value.ExactString())
347			call.formats = append(call.formats, v.Value)
348		}
349		// TODO: handle %m-directive.
350
351	case *ssa.Global:
352		x.globalData(v.Pos()).call = call
353
354	case *ssa.FieldAddr, *ssa.Field:
355		// TODO: mark field index v.Field of v.X.Type() for extraction. extract
356		// an example args as to give parameters for the translator.
357
358	case *ssa.Slice:
359		if v.Low == nil && v.High == nil && v.Max == nil {
360			x.visitFormats(call, v.X)
361		}
362
363	case *ssa.Parameter:
364		// TODO: handle the function for the index parameter.
365		f := v.Parent()
366		for i, p := range f.Params {
367			if p == v {
368				if call.formatPos < 0 {
369					call.formatPos = i
370					// TODO: is there a better way to detect this is calling
371					// a method rather than a function?
372					call.isMethod = len(f.Params) > f.Signature.Params().Len()
373					x.handleFunc(v.Parent(), call)
374				} else if debug && i != call.formatPos {
375					// TODO: support this.
376					fmt.Printf("WARNING:%s: format string passed to arg %d and %d\n",
377						posString(&x.conf, call.Pkg(), call.Pos()),
378						call.formatPos, i)
379				}
380			}
381		}
382
383	case *ssa.Alloc:
384		if ref := v.Referrers(); ref == nil {
385			for _, r := range *ref {
386				values := []ssa.Value{}
387				for _, o := range r.Operands(nil) {
388					if o == nil || *o == v {
389						continue
390					}
391					values = append(values, *o)
392				}
393				// TODO: return something different if we care about multiple
394				// values as well.
395				if len(values) == 1 {
396					x.visitFormats(call, values[0])
397				}
398			}
399		}
400
401		// TODO:
402	// case *ssa.Index:
403	// 	// Get all values in the array if applicable
404	// case *ssa.IndexAddr:
405	// 	// Get all values in the slice or *array if applicable.
406	// case *ssa.Lookup:
407	// 	// Get all values in the map if applicable.
408
409	case *ssa.FreeVar:
410		// TODO: find the link between free variables and parameters:
411		//
412		// func freeVar(p *message.Printer, str string) {
413		// 	fn := func(p *message.Printer) {
414		// 		p.Printf(str)
415		// 	}
416		// 	fn(p)
417		// }
418
419	case ssa.Instruction:
420		rands := v.Operands(nil)
421		if len(rands) == 1 && rands[0] != nil {
422			x.visitFormats(call, *rands[0])
423		}
424	case *ssa.Call:
425	}
426}
427
428// Note: a function may have an argument marked as both format and passthrough.
429
430// visitArgs collects information on arguments. For wrapped functions it will
431// just determine the position of the variable args slice.
432func (x *extracter) visitArgs(fd *callData, v ssa.Value) {
433	if v == nil {
434		return
435	}
436	x.debug(v, "ARGV", v)
437	switch v := v.(type) {
438
439	case *ssa.Slice:
440		if v.Low == nil && v.High == nil && v.Max == nil {
441			x.visitArgs(fd, v.X)
442		}
443
444	case *ssa.Parameter:
445		// TODO: handle the function for the index parameter.
446		f := v.Parent()
447		for i, p := range f.Params {
448			if p == v {
449				fd.argPos = i
450			}
451		}
452
453	case *ssa.Alloc:
454		if ref := v.Referrers(); ref == nil {
455			for _, r := range *ref {
456				values := []ssa.Value{}
457				for _, o := range r.Operands(nil) {
458					if o == nil || *o == v {
459						continue
460					}
461					values = append(values, *o)
462				}
463				// TODO: return something different if we care about
464				// multiple values as well.
465				if len(values) == 1 {
466					x.visitArgs(fd, values[0])
467				}
468			}
469		}
470
471	case ssa.Instruction:
472		rands := v.Operands(nil)
473		if len(rands) == 1 && rands[0] != nil {
474			x.visitArgs(fd, *rands[0])
475		}
476	}
477}
478
479// print returns Go syntax for the specified node.
480func (x *extracter) print(n ast.Node) string {
481	var buf bytes.Buffer
482	format.Node(&buf, x.conf.Fset, n)
483	return buf.String()
484}
485
// packageExtracter bundles one parsed file with the data needed to extract
// messages from it. It is constructed with a positional literal in
// extractMessages, so the field order is significant.
type packageExtracter struct {
	f    *ast.File           // file being inspected
	x    *extracter          // shared extraction state
	info *loader.PackageInfo // package that f belongs to
	cmap ast.CommentMap      // comments of f associated with their nodes
}
492
493func (px packageExtracter) getComment(n ast.Node) string {
494	cs := px.cmap.Filter(n).Comments()
495	if len(cs) > 0 {
496		return strings.TrimSpace(cs[0].Text())
497	}
498	return ""
499}
500
501func (x *extracter) extractMessages() {
502	prog := x.iprog
503	files := []packageExtracter{}
504	for _, info := range x.iprog.AllPackages {
505		for _, f := range info.Files {
506			// Associate comments with nodes.
507			px := packageExtracter{
508				f, x, info,
509				ast.NewCommentMap(prog.Fset, f, f.Comments),
510			}
511			files = append(files, px)
512		}
513	}
514	for _, px := range files {
515		ast.Inspect(px.f, func(n ast.Node) bool {
516			switch v := n.(type) {
517			case *ast.CallExpr:
518				if d := x.funcs[v.Lparen]; d != nil {
519					d.expr = v
520				}
521			}
522			return true
523		})
524	}
525	for _, px := range files {
526		ast.Inspect(px.f, func(n ast.Node) bool {
527			switch v := n.(type) {
528			case *ast.CallExpr:
529				return px.handleCall(v)
530			case *ast.ValueSpec:
531				return px.handleGlobal(v)
532			}
533			return true
534		})
535	}
536}
537
538func (px packageExtracter) handleGlobal(spec *ast.ValueSpec) bool {
539	comment := px.getComment(spec)
540
541	for _, ident := range spec.Names {
542		data, ok := px.x.globals[ident.Pos()]
543		if !ok {
544			continue
545		}
546		name := ident.Name
547		var arguments []argument
548		if data.call != nil {
549			arguments = px.getArguments(data.call)
550		} else if !strings.HasPrefix(name, "msg") && !strings.HasPrefix(name, "Msg") {
551			continue
552		}
553		data.visit(px.x, func(c constant.Value) {
554			px.addMessage(spec.Pos(), []string{name}, c, comment, arguments)
555		})
556	}
557
558	return true
559}
560
561func (px packageExtracter) handleCall(call *ast.CallExpr) bool {
562	x := px.x
563	data := x.funcs[call.Lparen]
564	if data == nil || len(data.formats) == 0 {
565		return true
566	}
567	if data.expr != call {
568		panic("invariant `data.call != call` failed")
569	}
570	x.debug(data.call, "INSERT", data.formats)
571
572	argn := data.callFormatPos()
573	if argn >= len(call.Args) {
574		return true
575	}
576	format := call.Args[argn]
577
578	arguments := px.getArguments(data)
579
580	comment := ""
581	key := []string{}
582	if ident, ok := format.(*ast.Ident); ok {
583		key = append(key, ident.Name)
584		if v, ok := ident.Obj.Decl.(*ast.ValueSpec); ok && v.Comment != nil {
585			// TODO: get comment above ValueSpec as well
586			comment = v.Comment.Text()
587		}
588	}
589	if c := px.getComment(call.Args[0]); c != "" {
590		comment = c
591	}
592
593	formats := data.formats
594	for _, c := range formats {
595		px.addMessage(call.Lparen, key, c, comment, arguments)
596	}
597	return true
598}
599
600func (px packageExtracter) getArguments(data *callData) []argument {
601	arguments := []argument{}
602	x := px.x
603	info := px.info
604	if data.callArgsStart() >= 0 {
605		args := data.expr.Args[data.callArgsStart():]
606		for i, arg := range args {
607			expr := x.print(arg)
608			val := ""
609			if v := info.Types[arg].Value; v != nil {
610				val = v.ExactString()
611				switch arg.(type) {
612				case *ast.BinaryExpr, *ast.UnaryExpr:
613					expr = val
614				}
615			}
616			arguments = append(arguments, argument{
617				ArgNum:         i + 1,
618				Type:           info.Types[arg].Type.String(),
619				UnderlyingType: info.Types[arg].Type.Underlying().String(),
620				Expr:           expr,
621				Value:          val,
622				Comment:        px.getComment(arg),
623				Position:       posString(&x.conf, info.Pkg, arg.Pos()),
624				// TODO report whether it implements
625				// interfaces plural.Interface,
626				// gender.Interface.
627			})
628		}
629	}
630	return arguments
631}
632
633func (px packageExtracter) addMessage(
634	pos token.Pos,
635	key []string,
636	c constant.Value,
637	comment string,
638	arguments []argument) {
639	x := px.x
640	fmtMsg := constant.StringVal(c)
641
642	ph := placeholders{index: map[string]string{}}
643
644	trimmed, _, _ := trimWS(fmtMsg)
645
646	p := fmtparser.Parser{}
647	simArgs := make([]interface{}, len(arguments))
648	for i, v := range arguments {
649		simArgs[i] = v
650	}
651	msg := ""
652	p.Reset(simArgs)
653	for p.SetFormat(trimmed); p.Scan(); {
654		name := ""
655		var arg *argument
656		switch p.Status {
657		case fmtparser.StatusText:
658			msg += p.Text()
659			continue
660		case fmtparser.StatusSubstitution,
661			fmtparser.StatusBadWidthSubstitution,
662			fmtparser.StatusBadPrecSubstitution:
663			arguments[p.ArgNum-1].used = true
664			arg = &arguments[p.ArgNum-1]
665			name = getID(arg)
666		case fmtparser.StatusBadArgNum, fmtparser.StatusMissingArg:
667			arg = &argument{
668				ArgNum:   p.ArgNum,
669				Position: posString(&x.conf, px.info.Pkg, pos),
670			}
671			name, arg.UnderlyingType = verbToPlaceholder(p.Text(), p.ArgNum)
672		}
673		sub := p.Text()
674		if !p.HasIndex {
675			r, sz := utf8.DecodeLastRuneInString(sub)
676			sub = fmt.Sprintf("%s[%d]%c", sub[:len(sub)-sz], p.ArgNum, r)
677		}
678		msg += fmt.Sprintf("{%s}", ph.addArg(arg, name, sub))
679	}
680	key = append(key, msg)
681
682	// Add additional Placeholders that can be used in translations
683	// that are not present in the string.
684	for _, arg := range arguments {
685		if arg.used {
686			continue
687		}
688		ph.addArg(&arg, getID(&arg), fmt.Sprintf("%%[%d]v", arg.ArgNum))
689	}
690
691	x.messages = append(x.messages, Message{
692		ID:      key,
693		Key:     fmtMsg,
694		Message: Text{Msg: msg},
695		// TODO(fix): this doesn't get the before comment.
696		Comment:      comment,
697		Placeholders: ph.slice,
698		Position:     posString(&x.conf, px.info.Pkg, pos),
699	})
700}
701
702func posString(conf *loader.Config, pkg *types.Package, pos token.Pos) string {
703	p := conf.Fset.Position(pos)
704	file := fmt.Sprintf("%s:%d:%d", filepath.Base(p.Filename), p.Line, p.Column)
705	return filepath.Join(pkg.Path(), file)
706}
707
708func getID(arg *argument) string {
709	s := getLastComponent(arg.Expr)
710	s = strip(s)
711	s = strings.Replace(s, " ", "", -1)
712	// For small variable names, use user-defined types for more info.
713	if len(s) <= 2 && arg.UnderlyingType != arg.Type {
714		s = getLastComponent(arg.Type)
715	}
716	return strings.Title(s)
717}
718
// strip is a dirty hack to convert function calls to placeholder IDs.
//
// Spaces and '-' become '_', and every other rune that is not a letter, mark
// or number is dropped. A leading "Get" or "get" is then removed when it is
// followed by something other than a lowercase letter or mark, so that
// "getName" and "GetName" both yield "Name" while words such as "getter" are
// left alone.
func strip(s string) string {
	s = strings.Map(func(r rune) rune {
		switch {
		case unicode.IsSpace(r) || r == '-':
			return '_'
		case unicode.In(r, unicode.Letter, unicode.Mark, unicode.Number):
			return r
		}
		return -1
	}, s)
	// Strip "Get" from getter functions.
	const prefixLen = len("get")
	if len(s) > prefixLen && (strings.HasPrefix(s, "Get") || strings.HasPrefix(s, "get")) {
		// Inspect the rune that follows the prefix, not the prefix itself,
		// to tell a getter from a word that merely starts with "get".
		r, _ := utf8.DecodeRuneInString(s[prefixLen:])
		if !unicode.In(r, unicode.Ll, unicode.M) { // not lower or mark
			s = s[prefixLen:]
		}
	}
	return s
}
741
// verbToPlaceholder gives a name for a placeholder based on the substitution
// verb, which is taken to be the last rune of sub. It also returns the
// underlying type suggested by that verb. This is only to be used if there
// is otherwise no other type information available.
func verbToPlaceholder(sub string, pos int) (name, underlying string) {
	verb, _ := utf8.DecodeLastRuneInString(sub)
	switch verb {
	case 'd':
		return "Integer", "int"
	case 'e', 'f', 'g':
		return "Number", "float64"
	case 'm':
		return "Message", "string"
	}

	// The remaining verbs have no descriptive name and fall back to a
	// positional one.
	name = fmt.Sprintf("Arg_%d", pos)
	if verb == 's' || verb == 'q' {
		return name, "string"
	}
	return name, "interface{}"
}
765
766type placeholders struct {
767	index map[string]string
768	slice []Placeholder
769}
770
771func (p *placeholders) addArg(arg *argument, name, sub string) (id string) {
772	id = name
773	alt, ok := p.index[id]
774	for i := 1; ok && alt != sub; i++ {
775		id = fmt.Sprintf("%s_%d", name, i)
776		alt, ok = p.index[id]
777	}
778	p.index[id] = sub
779	p.slice = append(p.slice, Placeholder{
780		ID:             id,
781		String:         sub,
782		Type:           arg.Type,
783		UnderlyingType: arg.UnderlyingType,
784		ArgNum:         arg.ArgNum,
785		Expr:           arg.Expr,
786		Comment:        arg.Comment,
787	})
788	return id
789}
790
// getLastComponent returns the part of s after its last '.', or all of s if
// it contains no '.'.
func getLastComponent(s string) string {
	dot := strings.LastIndexByte(s, '.')
	if dot < 0 {
		return s
	}
	return s[dot+1:]
}
794
// isMsg returns whether s should be translated, which is currently the case
// for any string containing at least one letter.
func isMsg(s string) bool {
	// TODO: parse as format string and omit strings that contain letters
	// coming from format verbs.
	for _, r := range s {
		if unicode.IsLetter(r) {
			return true
		}
	}
	return false
}
806