1 /*
2    This file is part of the BOLT-LMM linear mixed model software package
3    developed by Po-Ru Loh.  Copyright (C) 2014-2019 Harvard University.
4 
5    This program is free software: you can redistribute it and/or modify
6    it under the terms of the GNU General Public License as published by
7    the Free Software Foundation, either version 3 of the License, or
8    (at your option) any later version.
9 
10    This program is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13    GNU General Public License for more details.
14 
15    You should have received a copy of the GNU General Public License
16    along with this program.  If not, see <http://www.gnu.org/licenses/>.
17 */
18 
19 #ifndef COVARIATEBASIS_HPP
20 #define COVARIATEBASIS_HPP
21 
22 #include <vector>
23 #include <string>
24 #include <utility>
25 #include <boost/utility.hpp>
26 
27 #include "DataMatrix.hpp"
28 
29 namespace LMM {
30 
31   class CovariateBasis : boost::noncopyable {
32 
33   private:
34     uint64 C; // number of covariates (including all-1s) chosen from DataMatrix of covariates
35     uint64 Cindep; // number of independent covariates
36     uint64 Nstride; // minor dimension of maskedCovars and basis arrays (for data layout)
37     uint64 Nused; // popcnt(maskIndivs)
38 
39     // beyond input _maskIndivs, samples with missing covariates are masked if !covarUseMissingIndic
40     double *maskIndivs; // [VECTOR]: 0-1 vector of length Nstride
41     double *basis; // [[MATRIX]]: C x Nstride; normalized to have vector norm 1 (unlike snps)
42     double *basisExtAllIndivs; // [[MATRIX]]: Cindep x Nstride; basis "extended" for OOS prediction
43 
44   public:
45     static const double MIN_REL_SINGULAR_VALUE;
46 
47     CovariateBasis(const DataMatrix &covarDataT, const double _maskIndivs[],
48 		   const std::vector < std::pair <std::string, DataMatrix::ValueType> > &covars,
49 		   int covarMaxLevels, bool covarUseMissingIndic);
50 
51     /**
52      * input:
53      * - covarDataT is assumed to contain the all-1s vector as its first row (indexed 0)
54      * - _maskIndivs has dimension >= covarDataT.ncols = N (if > N, additional cols are masked)
55      *   - assumed to be a subset of maskIndivs from original SnpData instance
56      *   - (presumably obtained by using SnpData.writeMaskIndivs and taking a subset)
57      * action:
58      * - copies _maskIndivs into member maskIndivs (and optionally performs missing masking)
59      * - builds masked copy of selected covariates (from covarDataT) in maskedCovars[C x Nstride]
60      *   - (possible later use: DGELS to get least squares coeffs wrt original covars)
61      * - mean-fills missing covariate values (using mean of non-masked, non-missing)
62      * - computes SVD; stores in basis[C x Nstride] and sets Cindep (warns if < C)
63      */
64     void init(const DataMatrix &covarDataT, const double _maskIndivs[],
65 	      std::vector < std::pair <std::string, DataMatrix::ValueType> > covars,
66 	      int covarMaxLevels, bool covarUseMissingIndic);
67 
68     ~CovariateBasis();
69 
70     //uint64 getC(void) const { return C; } -- don't expose this!
71     uint64 getCindep(void) const;
72     const double *getMaskIndivs(void) const;
73     uint64 getNused(void) const;
74     const double *getBasis(bool extAllIndivs) const;
75 
76     // vec is assumed to have dimension Nstride
77     // don't assume memory alignment (no SSE)
78     void applyMaskIndivs(double vec[]) const;
79 
80     /**
81      * Cindep values will be written to out[]
82      * vec[] has size Nstride
83      */
84     void computeCindepComponents(double out[], const double vec[]) const;
85 
86     /**
87      * vec[] has size Nstride
88      * don't assume memory alignment (no SSE)
89      */
90     void projectCovars(double vec[]) const;
91 
92     // debugging function
93     void printProj(const double vec[], const char name[]) const;
94 
95   };
96 }
97 
98 #endif
99