1// Copyright ©2016 The Gonum Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package stat_test
6
7import (
8	"fmt"
9	"log"
10
11	"gonum.org/v1/gonum/floats"
12	"gonum.org/v1/gonum/mat"
13	"gonum.org/v1/gonum/stat"
14)
15
16// symView is a helper for getting a View of a SymDense.
17type symView struct {
18	sym *mat.SymDense
19
20	i, j, r, c int
21}
22
23func (s symView) Dims() (r, c int) { return s.r, s.c }
24
25func (s symView) At(i, j int) float64 {
26	if i < 0 || s.r <= i {
27		panic("i out of bounds")
28	}
29	if j < 0 || s.c <= j {
30		panic("j out of bounds")
31	}
32	return s.sym.At(s.i+i, s.j+j)
33}
34
35func (s symView) T() mat.Matrix { return mat.Transpose{Matrix: s} }
36
37func ExampleCC() {
38	// This example is directly analogous to Example 3.5 on page 87 of
39	// Koch, Inge. Analysis of multivariate and high-dimensional data.
40	// Vol. 32. Cambridge University Press, 2013. ISBN: 9780521887939
41
42	// bostonData is the Boston Housing Data of Harrison and Rubinfeld (1978)
43	n, _ := bostonData.Dims()
44	var xd, yd = 7, 4
45	// The variables (columns) of bostonData can be partitioned into two sets:
46	// those that deal with environmental/social variables (xdata), and those
47	// that contain information regarding the individual (ydata). Because the
48	// variables can be naturally partitioned in this way, these data are
49	// appropriate for canonical correlation analysis. The columns (variables)
50	// of xdata are, in order:
51	//  per capita crime rate by town,
52	//  proportion of non-retail business acres per town,
53	//  nitric oxide concentration (parts per 10 million),
54	//  weighted distances to Boston employment centres,
55	//  index of accessibility to radial highways,
56	//  pupil-teacher ratio by town, and
57	//  proportion of blacks by town.
58	xdata := bostonData.Slice(0, n, 0, xd)
59
60	// The columns (variables) of ydata are, in order:
61	//  average number of rooms per dwelling,
62	//  proportion of owner-occupied units built prior to 1940,
63	//  full-value property-tax rate per $10000, and
64	//  median value of owner-occupied homes in $1000s.
65	ydata := bostonData.Slice(0, n, xd, xd+yd)
66
67	// For comparison, calculate the correlation matrix for the original data.
68	var cor mat.SymDense
69	stat.CorrelationMatrix(&cor, bostonData, nil)
70
71	// Extract just those correlations that are between xdata and ydata.
72	var corRaw = symView{sym: &cor, i: 0, j: xd, r: xd, c: yd}
73
74	// Note that the strongest correlation between individual variables is 0.91
75	// between the 5th variable of xdata (index of accessibility to radial
76	// highways) and the 3rd variable of ydata (full-value property-tax rate per
77	// $10000).
78	fmt.Printf("corRaw = %.4f", mat.Formatted(corRaw, mat.Prefix("         ")))
79
80	// Calculate the canonical correlations.
81	var cc stat.CC
82	err := cc.CanonicalCorrelations(xdata, ydata, nil)
83	if err != nil {
84		log.Fatal(err)
85	}
86
87	// Unpack cc.
88	var pVecs, qVecs, phiVs, psiVs mat.Dense
89	ccors := cc.CorrsTo(nil)
90	cc.LeftTo(&pVecs, true)
91	cc.RightTo(&qVecs, true)
92	cc.LeftTo(&phiVs, false)
93	cc.RightTo(&psiVs, false)
94
95	// Canonical Correlation Matrix, or the correlations between the sphered
96	// data.
97	var corSph mat.Dense
98	corSph.CloneFrom(&pVecs)
99	col := make([]float64, xd)
100	for j := 0; j < yd; j++ {
101		mat.Col(col, j, &corSph)
102		floats.Scale(ccors[j], col)
103		corSph.SetCol(j, col)
104	}
105	corSph.Product(&corSph, qVecs.T())
106	fmt.Printf("\n\ncorSph = %.4f", mat.Formatted(&corSph, mat.Prefix("         ")))
107
108	// Canonical Correlations. Note that the first canonical correlation is
109	// 0.95, stronger than the greatest correlation in the original data, and
110	// much stronger than the greatest correlation in the sphered data.
111	fmt.Printf("\n\nccors = %.4f", ccors)
112
113	// Left and right eigenvectors of the canonical correlation matrix.
114	fmt.Printf("\n\npVecs = %.4f", mat.Formatted(&pVecs, mat.Prefix("        ")))
115	fmt.Printf("\n\nqVecs = %.4f", mat.Formatted(&qVecs, mat.Prefix("        ")))
116
117	// Canonical Correlation Transforms. These can be useful as they represent
118	// the canonical variables as linear combinations of the original variables.
119	fmt.Printf("\n\nphiVs = %.4f", mat.Formatted(&phiVs, mat.Prefix("        ")))
120	fmt.Printf("\n\npsiVs = %.4f", mat.Formatted(&psiVs, mat.Prefix("        ")))
121
122	// Output:
123	// corRaw = ⎡-0.2192   0.3527   0.5828  -0.3883⎤
124	//          ⎢-0.3917   0.6448   0.7208  -0.4837⎥
125	//          ⎢-0.3022   0.7315   0.6680  -0.4273⎥
126	//          ⎢ 0.2052  -0.7479  -0.5344   0.2499⎥
127	//          ⎢-0.2098   0.4560   0.9102  -0.3816⎥
128	//          ⎢-0.3555   0.2615   0.4609  -0.5078⎥
129	//          ⎣ 0.1281  -0.2735  -0.4418   0.3335⎦
130	//
131	// corSph = ⎡ 0.0118   0.0525   0.2300  -0.1363⎤
132	//          ⎢-0.1810   0.3213   0.3814  -0.1412⎥
133	//          ⎢ 0.0166   0.2241   0.0104  -0.2235⎥
134	//          ⎢ 0.0346  -0.5481  -0.0034  -0.1994⎥
135	//          ⎢ 0.0303  -0.0956   0.7152   0.2039⎥
136	//          ⎢-0.0298  -0.0022   0.0739  -0.3703⎥
137	//          ⎣-0.1226  -0.0746  -0.3899   0.1541⎦
138	//
139	// ccors = [0.9451 0.6787 0.5714 0.2010]
140	//
141	// pVecs = ⎡-0.2574   0.0158   0.2122  -0.0946⎤
142	//         ⎢-0.4837   0.3837   0.1474   0.6597⎥
143	//         ⎢-0.0801   0.3494   0.3287  -0.2862⎥
144	//         ⎢ 0.1278  -0.7337   0.4851   0.2248⎥
145	//         ⎢-0.6969  -0.4342  -0.3603   0.0291⎥
146	//         ⎢-0.0991   0.0503   0.6384   0.1022⎥
147	//         ⎣ 0.4260   0.0323  -0.2290   0.6419⎦
148	//
149	// qVecs = ⎡ 0.0182  -0.1583  -0.0067  -0.9872⎤
150	//         ⎢-0.2348   0.9483  -0.1462  -0.1554⎥
151	//         ⎢-0.9701  -0.2406  -0.0252   0.0209⎥
152	//         ⎣ 0.0593  -0.1330  -0.9889   0.0291⎦
153	//
154	// phiVs = ⎡-0.0027   0.0093   0.0490  -0.0155⎤
155	//         ⎢-0.0429  -0.0242   0.0361   0.1839⎥
156	//         ⎢-1.2248   5.6031   5.8094  -4.7927⎥
157	//         ⎢-0.0044  -0.3424   0.4470   0.1150⎥
158	//         ⎢-0.0742  -0.1193  -0.1116   0.0022⎥
159	//         ⎢-0.0233   0.1046   0.3853  -0.0161⎥
160	//         ⎣ 0.0001   0.0005  -0.0030   0.0082⎦
161	//
162	// psiVs = ⎡ 0.0302  -0.3002   0.0878  -1.9583⎤
163	//         ⎢-0.0065   0.0392  -0.0118  -0.0061⎥
164	//         ⎢-0.0052  -0.0046  -0.0023   0.0008⎥
165	//         ⎣ 0.0020   0.0037  -0.1293   0.1038⎦
166}
167