gonum/stat/cca_example_test.go

// Copyright ©2016 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package stat_test

import (
	"fmt"
	"log"

	"gonum.org/v1/gonum/floats"
	"gonum.org/v1/gonum/mat"
	"gonum.org/v1/gonum/stat"
)

// symView is a helper for getting a View of a SymDense.
type symView struct {
	sym *mat.SymDense

	i, j, r, c int
}

func (s symView) Dims() (r, c int) { return s.r, s.c }

func (s symView) At(i, j int) float64 {
	if i < 0 || s.r <= i {
		panic("i out of bounds")
	}
	if j < 0 || s.c <= j {
		panic("j out of bounds")
	}
	return s.sym.At(s.i+i, s.j+j)
}

func (s symView) T() mat.Matrix { return mat.Transpose{Matrix: s} }

func ExampleCC() {
	// This example is directly analogous to Example 3.5 on page 87 of
	// Koch, Inge. Analysis of multivariate and high-dimensional data.
	// Vol. 32. Cambridge University Press, 2013. ISBN: 9780521887939

	// bostonData is the Boston Housing Data of Harrison and Rubinfeld (1978)
	n, _ := bostonData.Dims()
	var xd, yd = 7, 4
	// The variables (columns) of bostonData can be partitioned into two sets:
	// those that deal with environmental/social variables (xdata), and those
	// that contain information regarding the individual (ydata). Because the
	// variables can be naturally partitioned in this way, these data are
	// appropriate for canonical correlation analysis. The columns (variables)
	// of xdata are, in order:
	//  per capita crime rate by town,
	//  proportion of non-retail business acres per town,
	//  nitric oxide concentration (parts per 10 million),
	//  weighted distances to Boston employment centres,
	//  index of accessibility to radial highways,
	//  pupil-teacher ratio by town, and
	//  proportion of blacks by town.
	xdata := bostonData.Slice(0, n, 0, xd)

	// The columns (variables) of ydata are, in order:
	//  average number of rooms per dwelling,
	//  proportion of owner-occupied units built prior to 1940,
	//  full-value property-tax rate per $10000, and
	//  median value of owner-occupied homes in $1000s.
	ydata := bostonData.Slice(0, n, xd, xd+yd)

	// For comparison, calculate the correlation matrix for the original data.
	var cor mat.SymDense
	stat.CorrelationMatrix(&cor, bostonData, nil)

	// Extract just those correlations that are between xdata and ydata.
	var corRaw = symView{sym: &cor, i: 0, j: xd, r: xd, c: yd}

	// Note that the strongest correlation between individual variables is 0.91
	// between the 5th variable of xdata (index of accessibility to radial
	// highways) and the 3rd variable of ydata (full-value property-tax rate per
	// $10000).
	fmt.Printf("corRaw = %.4f", mat.Formatted(corRaw, mat.Prefix("         ")))

	// Calculate the canonical correlations.
	var cc stat.CC
	err := cc.CanonicalCorrelations(xdata, ydata, nil)
	if err != nil {
		log.Fatal(err)
	}

	// Unpack cc.
	var pVecs, qVecs, phiVs, psiVs mat.Dense
	ccors := cc.CorrsTo(nil)
	cc.LeftTo(&pVecs, true)
	cc.RightTo(&qVecs, true)
	cc.LeftTo(&phiVs, false)
	cc.RightTo(&psiVs, false)

	// Canonical Correlation Matrix, or the correlations between the sphered
	// data.
	var corSph mat.Dense
	corSph.CloneFrom(&pVecs)
	col := make([]float64, xd)
	for j := 0; j < yd; j++ {
		mat.Col(col, j, &corSph)
		floats.Scale(ccors[j], col)
		corSph.SetCol(j, col)
	}
	corSph.Product(&corSph, qVecs.T())
	fmt.Printf("\n\ncorSph = %.4f", mat.Formatted(&corSph, mat.Prefix("         ")))

	// Canonical Correlations. Note that the first canonical correlation is
	// 0.95, stronger than the greatest correlation in the original data, and
	// much stronger than the greatest correlation in the sphered data.
	fmt.Printf("\n\nccors = %.4f", ccors)

	// Left and right eigenvectors of the canonical correlation matrix.
	fmt.Printf("\n\npVecs = %.4f", mat.Formatted(&pVecs, mat.Prefix("        ")))
	fmt.Printf("\n\nqVecs = %.4f", mat.Formatted(&qVecs, mat.Prefix("        ")))

	// Canonical Correlation Transforms. These can be useful as they represent
	// the canonical variables as linear combinations of the original variables.
	fmt.Printf("\n\nphiVs = %.4f", mat.Formatted(&phiVs, mat.Prefix("        ")))
	fmt.Printf("\n\npsiVs = %.4f", mat.Formatted(&psiVs, mat.Prefix("        ")))

	// Output:
	// corRaw = ⎡-0.2192   0.3527   0.5828  -0.3883⎤
	//          ⎢-0.3917   0.6448   0.7208  -0.4837⎥
	//          ⎢-0.3022   0.7315   0.6680  -0.4273⎥
	//          ⎢ 0.2052  -0.7479  -0.5344   0.2499⎥
	//          ⎢-0.2098   0.4560   0.9102  -0.3816⎥
	//          ⎢-0.3555   0.2615   0.4609  -0.5078⎥
	//          ⎣ 0.1281  -0.2735  -0.4418   0.3335⎦
	//
	// corSph = ⎡ 0.0118   0.0525   0.2300  -0.1363⎤
	//          ⎢-0.1810   0.3213   0.3814  -0.1412⎥
	//          ⎢ 0.0166   0.2241   0.0104  -0.2235⎥
	//          ⎢ 0.0346  -0.5481  -0.0034  -0.1994⎥
	//          ⎢ 0.0303  -0.0956   0.7152   0.2039⎥
	//          ⎢-0.0298  -0.0022   0.0739  -0.3703⎥
	//          ⎣-0.1226  -0.0746  -0.3899   0.1541⎦
	//
	// ccors = [0.9451 0.6787 0.5714 0.2010]
	//
	// pVecs = ⎡-0.2574   0.0158   0.2122  -0.0946⎤
	//         ⎢-0.4837   0.3837   0.1474   0.6597⎥
	//         ⎢-0.0801   0.3494   0.3287  -0.2862⎥
	//         ⎢ 0.1278  -0.7337   0.4851   0.2248⎥
	//         ⎢-0.6969  -0.4342  -0.3603   0.0291⎥
	//         ⎢-0.0991   0.0503   0.6384   0.1022⎥
	//         ⎣ 0.4260   0.0323  -0.2290   0.6419⎦
	//
	// qVecs = ⎡ 0.0182  -0.1583  -0.0067  -0.9872⎤
	//         ⎢-0.2348   0.9483  -0.1462  -0.1554⎥
	//         ⎢-0.9701  -0.2406  -0.0252   0.0209⎥
	//         ⎣ 0.0593  -0.1330  -0.9889   0.0291⎦
	//
	// phiVs = ⎡-0.0027   0.0093   0.0490  -0.0155⎤
	//         ⎢-0.0429  -0.0242   0.0361   0.1839⎥
	//         ⎢-1.2248   5.6031   5.8094  -4.7927⎥
	//         ⎢-0.0044  -0.3424   0.4470   0.1150⎥
	//         ⎢-0.0742  -0.1193  -0.1116   0.0022⎥
	//         ⎢-0.0233   0.1046   0.3853  -0.0161⎥
	//         ⎣ 0.0001   0.0005  -0.0030   0.0082⎦
	//
	// psiVs = ⎡ 0.0302  -0.3002   0.0878  -1.9583⎤
	//         ⎢-0.0065   0.0392  -0.0118  -0.0061⎥
	//         ⎢-0.0052  -0.0046  -0.0023   0.0008⎥
	//         ⎣ 0.0020   0.0037  -0.1293   0.1038⎦
}