1 /**
2 * UGENE - Integrated Bioinformatics Tools.
3 * Copyright (C) 2008-2021 UniPro <ugene@unipro.ru>
4 * http://ugene.net
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version 2
9 * of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
19 * MA 02110-1301, USA.
20 */
21
22 #ifdef SW2_BUILD_WITH_CUDA
23
24 # include <algorithm>
25 # include <cuda_runtime.h>
26 # include <stdio.h>
27 # include <stdlib.h>
28
29 # include <U2Core/Log.h>
30
31 # include "SmithWatermanAlgorithm.h"
32 # include "sw_cuda_cpp.h"
33
34 typedef int ScoreType;
35
36 extern QList<resType> calculateOnGPU(const char *seqLib, int seqLibLength, ScoreType *queryProfile, ScoreType qProfLen, int queryLength, ScoreType gapOpen, ScoreType gapExtension, ScoreType maxScore, U2::SmithWatermanSettings::SWResultView resultView);
37
38 static U2::Logger u2log("Smith Waterman CUDA");
39
launch(const char * seqLib,int seqLibLength,ScoreType * queryProfile,ScoreType qProfLen,int queryLength,ScoreType gapOpen,ScoreType gapExtension,ScoreType maxScore,U2::SmithWatermanSettings::SWResultView resultView)40 QList<resType> sw_cuda_cpp::launch(const char *seqLib, int seqLibLength, ScoreType *queryProfile, ScoreType qProfLen, int queryLength, ScoreType gapOpen, ScoreType gapExtension, ScoreType maxScore, U2::SmithWatermanSettings::SWResultView resultView) {
41 return calculateOnGPU(seqLib, seqLibLength, queryProfile, qProfLen, queryLength, gapOpen, gapExtension, maxScore, resultView);
42 }
43
44 // TODO: calculate maximum alignment length
calcOverlap(int queryLength)45 int calcOverlap(int queryLength) {
46 return queryLength * 3;
47 }
48
49 // number of parts of the sequence which we divide
calcPartsNumber(int seqLibLength,int overlapLength)50 int calcPartsNumber(int seqLibLength, int overlapLength) {
51 int partsNumber = (seqLibLength + overlapLength - 1) / overlapLength;
52
53 if (partsNumber > sw_cuda_cpp::MAX_BLOCKS_NUMBER) {
54 partsNumber = sw_cuda_cpp::MAX_BLOCKS_NUMBER;
55 }
56 return partsNumber;
57 }
58
59 // size of sequence's part
calcPartSeqSize(int seqLibLength,int overlapLength,int partsNumber)60 int calcPartSeqSize(int seqLibLength, int overlapLength, int partsNumber) {
61 return (seqLibLength + (partsNumber - 1) * (overlapLength + 1)) / partsNumber;
62 }
63
64 // size of vector that contain all results
calcSizeRow(int,int,int partsNumber,int partSeqSize)65 int calcSizeRow(int, int, int partsNumber, int partSeqSize) {
66 return (partSeqSize + 1) * partsNumber;
67 }
68
estimateNeededGpuMemory(int seqLibLength,ScoreType qProfLen,int queryLength,const U2::SmithWatermanSettings::SWResultView resultView)69 quint64 sw_cuda_cpp::estimateNeededGpuMemory(int seqLibLength, ScoreType qProfLen, int queryLength, const U2::SmithWatermanSettings::SWResultView resultView) {
70 int sizeP = qProfLen * sizeof(ScoreType);
71 int sizeL = (seqLibLength) * sizeof(char);
72
73 const int overlapLength = calcOverlap(queryLength);
74 int partsNumber = calcPartsNumber(seqLibLength, overlapLength);
75 int partSeqSize = calcPartSeqSize(seqLibLength, overlapLength, partsNumber);
76 int sizeRow = calcSizeRow(seqLibLength, overlapLength, partsNumber, partSeqSize);
77
78 int sizeN = 7 * sizeRow * sizeof(ScoreType);
79
80 int directionMatrixSize = 0;
81 int backtraceBeginsSize = 0;
82 if (U2::SmithWatermanSettings::MULTIPLE_ALIGNMENT == resultView) {
83 directionMatrixSize = sizeof(int) * queryLength * seqLibLength;
84 backtraceBeginsSize = sizeof(int) * sizeRow * 2;
85 }
86
87 quint64 memToAlloc = sizeL + sizeP + sizeN; // see cudaMallocs in sw_cuda.cu for details
88 return memToAlloc * 1.2; // just for safety
89 }
90
estimateNeededRamAmount(int seqLibLength,ScoreType,int queryLength,const U2::SmithWatermanSettings::SWResultView resultView)91 quint64 sw_cuda_cpp::estimateNeededRamAmount(int seqLibLength, ScoreType, int queryLength, const U2::SmithWatermanSettings::SWResultView resultView) {
92 const int overlapLength = calcOverlap(queryLength);
93 const int partsNumber = calcPartsNumber(seqLibLength, overlapLength);
94 const int partSeqSize = calcPartSeqSize(seqLibLength, overlapLength, partsNumber);
95 const int sizeRow = calcSizeRow(seqLibLength, overlapLength, partsNumber, partSeqSize);
96
97 int directionMatrixSize = 0;
98 int backtraceBeginsSize = 0;
99 if (U2::SmithWatermanSettings::MULTIPLE_ALIGNMENT == resultView) {
100 directionMatrixSize = sizeof(int) * queryLength * seqLibLength;
101 backtraceBeginsSize = sizeof(int) * sizeRow * 2;
102 }
103
104 const quint64 memToAlloc = 3 * sizeRow * sizeof(ScoreType) + directionMatrixSize + backtraceBeginsSize;
105 return memToAlloc;
106 }
107
108 // IMPORTANT: these settings depend on the video card
109 // TODO: develop logic for calculation this settings
110 const int sw_cuda_cpp::MAX_BLOCKS_NUMBER = 14;
111 // we have 3 shared vector, this mean all shared memory = MAX_SHARED_VECTOR_LENGTH * 3
112 const int sw_cuda_cpp::MAX_SHARED_VECTOR_LENGTH = 128;
113
114 //__global__
115 extern void calculateMatrix_wrap(int blockSize, int threadNum, const char *seqLib, ScoreType *queryProfile, ScoreType *g_HdataUp, ScoreType *g_HdataRec, ScoreType *g_HdataMax, ScoreType *g_FdataUp, ScoreType *g_directionsUp, ScoreType *g_directionsRec, ScoreType *g_directionsMax, int iteration, int *g_directionsMatrix, int *g_backtraceBegins);
116
117 extern void setConstants(int partSeqSize, int partsNumber, int overlapLength, int seqLibLength, int queryLength, int gapOpen, int gapExtension, int maxScore, int queryPartLength, char upSymbolDirectMatrix, char leftSymbolDirectMatrix, char diagSymbolDirectMatrix, char stopSymbolDirectMatrix);
118
calculateOnGPU(const char * seqLib,int seqLibLength,ScoreType * queryProfile,ScoreType qProfLen,int queryLength,ScoreType gapOpen,ScoreType gapExtension,ScoreType maxScore,U2::SmithWatermanSettings::SWResultView resultView)119 QList<resType> calculateOnGPU(const char *seqLib, int seqLibLength, ScoreType *queryProfile, ScoreType qProfLen, int queryLength, ScoreType gapOpen, ScoreType gapExtension, ScoreType maxScore, U2::SmithWatermanSettings::SWResultView resultView) {
120 QList<resType> pas;
121 if (seqLibLength < queryLength) {
122 u2log.details(QObject::tr("Pattern length (%1) is longer than search sequence length (%2).").arg(queryLength).
123 arg(seqLibLength));
124 return pas;
125 }
126
127 // TODO: calculate maximum alignment length
128 const int overlapLength = calcOverlap(queryLength);
129
130 int partsNumber = calcPartsNumber(seqLibLength, overlapLength);
131
132 int queryDevider = 1;
133 if (queryLength > sw_cuda_cpp::MAX_SHARED_VECTOR_LENGTH) {
134 queryDevider = (queryLength + sw_cuda_cpp::MAX_SHARED_VECTOR_LENGTH - 1) / sw_cuda_cpp::MAX_SHARED_VECTOR_LENGTH;
135 }
136
137 int partQuerySize = (queryLength + queryDevider - 1) / queryDevider;
138
139 int partSeqSize = calcPartSeqSize(seqLibLength, overlapLength, partsNumber);
140
141 int sizeRow = calcSizeRow(seqLibLength, overlapLength, partsNumber, partSeqSize);
142
143 u2log.details(QString("partsNumber: %1 queryDevider: %2").arg(partsNumber).arg(queryDevider));
144
145 u2log.details(QString("seqLen: %1 partSeqSize: %2 overlapSize: %3").arg(seqLibLength).arg(partSeqSize).arg(overlapLength));
146 u2log.details(QString("queryLen %1 partQuerySize: %2").arg(queryLength).arg(partQuerySize));
147
148 //************************** declare some temp variables on host
149
150 ScoreType *tempRow = new ScoreType[sizeRow];
151 ScoreType *zerroArr = new ScoreType[sizeRow];
152 for (int i = 0; i < sizeRow; i++) {
153 zerroArr[i] = 0;
154 }
155
156 ScoreType *directionRow = new ScoreType[sizeRow];
157
158 size_t directionMatrixSize = 0;
159 size_t backtraceBeginsSize = 0;
160 int *globalMatrix = nullptr;
161 int *backtraceBegins = nullptr;
162 if (U2::SmithWatermanSettings::MULTIPLE_ALIGNMENT == resultView) {
163 directionMatrixSize = seqLibLength * queryLength * sizeof(int);
164 backtraceBeginsSize = 2 * sizeRow * sizeof(int);
165
166 globalMatrix = new int[directionMatrixSize / sizeof(int)];
167 backtraceBegins = new int[backtraceBeginsSize / sizeof(int)];
168
169 memset(globalMatrix, 0, directionMatrixSize);
170 memset(backtraceBegins, 0, backtraceBeginsSize);
171 }
172 //************************** sizes of arrays
173
174 size_t sizeQ = sizeRow * sizeof(ScoreType);
175 size_t sizeQQ = (sizeRow) * sizeof(ScoreType);
176 size_t sizeP = qProfLen * sizeof(ScoreType);
177 size_t sizeL = (seqLibLength) * sizeof(char);
178
179 //************************** declare arrays on device
180
181 char *g_seqLib = nullptr;
182 ScoreType *g_queryProfile = nullptr;
183 ScoreType *g_HdataMax = nullptr;
184 ScoreType *g_HdataUp = nullptr;
185 ScoreType *g_HdataRec = nullptr;
186 ScoreType *g_HdataTmp;
187 ScoreType *g_FdataUp = nullptr;
188 ScoreType *g_directionsUp = nullptr;
189 ScoreType *g_directionsMax = nullptr;
190 ScoreType *g_directionsRec = nullptr;
191 int *g_directionsMatrix = nullptr;
192 int *g_backtraceBegins = nullptr;
193
194 //************************** allocate global memory on device
195
196 QList<cudaError> errors = { cudaMalloc((void **)&g_seqLib, sizeL),
197 cudaMalloc((void **)&g_queryProfile, sizeP),
198 cudaMalloc((void **)&g_HdataMax, sizeQ),
199 cudaMalloc((void **)&g_HdataUp, sizeQ),
200 cudaMalloc((void **)&g_FdataUp, sizeQ),
201 cudaMalloc((void **)&g_directionsUp, sizeQ),
202 cudaMalloc((void **)&g_directionsMax, sizeQ),
203 cudaMalloc((void **)&g_HdataRec, sizeQ),
204 cudaMalloc((void **)&g_directionsRec, sizeQ) };
205
206 if (U2::SmithWatermanSettings::MULTIPLE_ALIGNMENT == resultView) {
207 cudaError errorMatrix = cudaMalloc(reinterpret_cast<void **>(&g_directionsMatrix), directionMatrixSize);
208 cudaError errorBacktrace = cudaMalloc(reinterpret_cast<void **>(&g_backtraceBegins), backtraceBeginsSize);
209 errors << errorMatrix << errorBacktrace;
210 }
211
212 if (std::find_if(errors.constBegin(), errors.constEnd(), [](const cudaError error) {
213 return error != cudaError::cudaSuccess; }) != errors.constEnd()) {
214 u2log.error(QObject::tr("CUDA malloc error"));
215 cudaFree(g_seqLib);
216 cudaFree(g_queryProfile);
217 cudaFree(g_HdataMax);
218 cudaFree(g_HdataUp);
219 cudaFree(g_FdataUp);
220 cudaFree(g_directionsUp);
221 cudaFree(g_directionsMax);
222 cudaFree(g_HdataRec);
223 cudaFree(g_directionsRec);
224 cudaFree(g_directionsMatrix);
225 cudaFree(g_backtraceBegins);
226 delete[] tempRow;
227 delete[] directionRow;
228 delete[] zerroArr;
229 delete[] globalMatrix;
230 delete[] backtraceBegins;
231
232 return pas;
233 }
234
235 u2log.details(QString("GLOBAL MEMORY USED %1 KB").arg((sizeL + sizeP + sizeQ * 7 + directionMatrixSize + backtraceBeginsSize) / 1024));
236
237 //************************** copy from host to device
238
239 cudaMemcpy(g_seqLib, seqLib, sizeL, cudaMemcpyHostToDevice);
240 cudaMemcpy(g_queryProfile, queryProfile, sizeP, cudaMemcpyHostToDevice);
241 cudaMemcpy(g_HdataMax, zerroArr, sizeQ, cudaMemcpyHostToDevice);
242 cudaMemcpy(g_HdataUp, zerroArr, sizeQ, cudaMemcpyHostToDevice);
243 cudaMemcpy(g_FdataUp, zerroArr, sizeQ, cudaMemcpyHostToDevice);
244 cudaMemcpy(g_directionsUp, zerroArr, sizeQ, cudaMemcpyHostToDevice);
245 cudaMemcpy(g_directionsMax, zerroArr, sizeQ, cudaMemcpyHostToDevice);
246 cudaMemcpy(g_directionsRec, zerroArr, sizeQ, cudaMemcpyHostToDevice);
247 cudaMemcpy(g_HdataRec, zerroArr, sizeQ, cudaMemcpyHostToDevice);
248 if (U2::SmithWatermanSettings::MULTIPLE_ALIGNMENT == resultView) {
249 cudaMemcpy(g_directionsMatrix, globalMatrix, directionMatrixSize, cudaMemcpyHostToDevice);
250 cudaMemcpy(g_backtraceBegins, backtraceBegins, backtraceBeginsSize, cudaMemcpyHostToDevice);
251 }
252
253 //************************** start calculation
254
255 int BLOCK_SIZE = partsNumber;
256
257 dim3 dimBlock(BLOCK_SIZE);
258 dim3 dimGrid(partQuerySize);
259
260 // move constants variables to constant cuda memory
261 setConstants(partSeqSize, partsNumber, overlapLength, seqLibLength, queryLength, gapOpen, gapExtension, maxScore, partQuerySize, U2::SmithWatermanAlgorithm::UP, U2::SmithWatermanAlgorithm::LEFT, U2::SmithWatermanAlgorithm::DIAG, U2::SmithWatermanAlgorithm::STOP);
262
263 size_t sh_mem_size = sizeof(ScoreType) * (dimGrid.x + 1) * 3;
264 u2log.details(QString("SHARED MEM SIZE USED: %1 B").arg(sh_mem_size));
265 // start main loop
266 for (int i = 0; i < queryDevider; i++) {
267 calculateMatrix_wrap(dimBlock.x, dimGrid.x, g_seqLib, g_queryProfile, g_HdataUp, g_HdataRec, g_HdataMax, g_FdataUp, g_directionsUp, g_directionsRec, g_directionsMax, i * partQuerySize, g_directionsMatrix, g_backtraceBegins);
268
269 cudaError hasErrors = cudaThreadSynchronize();
270
271 if (hasErrors != 0) {
272 u2log.trace(QString("CUDA ERROR HAPPEN, errorId: ") + QString::number(hasErrors));
273 }
274
275 // revert arrays
276 g_HdataTmp = g_HdataRec;
277 g_HdataRec = g_HdataUp;
278 g_HdataUp = g_HdataTmp;
279
280 g_HdataTmp = g_directionsRec;
281 g_directionsRec = g_directionsUp;
282 g_directionsUp = g_HdataTmp;
283 }
284
285 // Copy vectors on host and find actual results
286 cudaMemcpy(tempRow, g_HdataMax, sizeQQ, cudaMemcpyDeviceToHost);
287 cudaMemcpy(directionRow, g_directionsMax, sizeQQ, cudaMemcpyDeviceToHost);
288 if (U2::SmithWatermanSettings::MULTIPLE_ALIGNMENT == resultView) {
289 cudaMemcpy(globalMatrix, g_directionsMatrix, directionMatrixSize, cudaMemcpyDeviceToHost);
290 cudaMemcpy(backtraceBegins, g_backtraceBegins, backtraceBeginsSize, cudaMemcpyDeviceToHost);
291 }
292
293 resType res;
294 for (int j = 0; j < (sizeRow); j++) {
295 if (tempRow[j] >= maxScore) {
296 res.refSubseq.startPos = directionRow[j];
297 res.refSubseq.length = j - res.refSubseq.startPos + 1 - (j) / (partSeqSize + 1) * overlapLength - (j) / (partSeqSize + 1);
298 res.score = tempRow[j];
299 if (U2::SmithWatermanSettings::MULTIPLE_ALIGNMENT == resultView) {
300 qint32 pairAlignOffset = 0;
301
302 qint32 row = backtraceBegins[2 * j];
303 qint32 column = backtraceBegins[2 * j + 1];
304 while (U2::SmithWatermanAlgorithm::STOP != globalMatrix[seqLibLength * row + column]) {
305 if (U2::SmithWatermanAlgorithm::DIAG == globalMatrix[seqLibLength * row + column]) {
306 res.pairAlign[pairAlignOffset++] = U2::SmithWatermanAlgorithm::DIAG;
307 row--;
308 column--;
309 } else if (U2::SmithWatermanAlgorithm::LEFT == globalMatrix[seqLibLength * row + column]) {
310 res.pairAlign[pairAlignOffset++] = U2::SmithWatermanAlgorithm::UP;
311 column--;
312 } else if (U2::SmithWatermanAlgorithm::UP == globalMatrix[seqLibLength * row + column]) {
313 res.pairAlign[pairAlignOffset++] = U2::SmithWatermanAlgorithm::LEFT;
314 row--;
315 }
316 if (0 >= row || 0 >= column) {
317 break;
318 }
319 }
320 res.patternSubseq.startPos = row;
321 res.patternSubseq.length = backtraceBegins[2 * j] - row + 1;
322 }
323
324 pas.append(res);
325 }
326 }
327
328 // deallocation memory
329 cudaFree(g_seqLib);
330 cudaFree(g_queryProfile);
331 cudaFree(g_HdataMax);
332 cudaFree(g_HdataUp);
333 cudaFree(g_HdataRec);
334 cudaFree(g_FdataUp);
335 cudaFree(g_directionsUp);
336 cudaFree(g_directionsMax);
337 cudaFree(g_directionsRec);
338
339 if (U2::SmithWatermanSettings::MULTIPLE_ALIGNMENT == resultView) {
340 cudaFree(g_directionsMatrix);
341 cudaFree(g_backtraceBegins);
342 }
343
344 delete[] tempRow;
345 delete[] directionRow;
346 delete[] zerroArr;
347 delete[] globalMatrix;
348 delete[] backtraceBegins;
349
350 return pas;
351 }
352
353 #endif // SW2_BUILD_WITH_CUDA
354