1 /**
2  * UGENE - Integrated Bioinformatics Tools.
3  * Copyright (C) 2008-2021 UniPro <ugene@unipro.ru>
4  * http://ugene.net
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License
8  * as published by the Free Software Foundation; either version 2
9  * of the License, or (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
19  * MA 02110-1301, USA.
20  */
21 
22 #ifdef SW2_BUILD_WITH_CUDA
23 
24 #    include <algorithm>
25 #    include <cuda_runtime.h>
26 #    include <stdio.h>
27 #    include <stdlib.h>
28 
29 #    include <U2Core/Log.h>
30 
31 #    include "SmithWatermanAlgorithm.h"
32 #    include "sw_cuda_cpp.h"
33 
34 typedef int ScoreType;
35 
36 extern QList<resType> calculateOnGPU(const char *seqLib, int seqLibLength, ScoreType *queryProfile, ScoreType qProfLen, int queryLength, ScoreType gapOpen, ScoreType gapExtension, ScoreType maxScore, U2::SmithWatermanSettings::SWResultView resultView);
37 
38 static U2::Logger u2log("Smith Waterman CUDA");
39 
launch(const char * seqLib,int seqLibLength,ScoreType * queryProfile,ScoreType qProfLen,int queryLength,ScoreType gapOpen,ScoreType gapExtension,ScoreType maxScore,U2::SmithWatermanSettings::SWResultView resultView)40 QList<resType> sw_cuda_cpp::launch(const char *seqLib, int seqLibLength, ScoreType *queryProfile, ScoreType qProfLen, int queryLength, ScoreType gapOpen, ScoreType gapExtension, ScoreType maxScore, U2::SmithWatermanSettings::SWResultView resultView) {
41     return calculateOnGPU(seqLib, seqLibLength, queryProfile, qProfLen, queryLength, gapOpen, gapExtension, maxScore, resultView);
42 }
43 
44 // TODO: calculate maximum alignment length
calcOverlap(int queryLength)45 int calcOverlap(int queryLength) {
46     return queryLength * 3;
47 }
48 
49 // number of parts of the sequence which we divide
calcPartsNumber(int seqLibLength,int overlapLength)50 int calcPartsNumber(int seqLibLength, int overlapLength) {
51     int partsNumber = (seqLibLength + overlapLength - 1) / overlapLength;
52 
53     if (partsNumber > sw_cuda_cpp::MAX_BLOCKS_NUMBER) {
54         partsNumber = sw_cuda_cpp::MAX_BLOCKS_NUMBER;
55     }
56     return partsNumber;
57 }
58 
59 // size of sequence's part
calcPartSeqSize(int seqLibLength,int overlapLength,int partsNumber)60 int calcPartSeqSize(int seqLibLength, int overlapLength, int partsNumber) {
61     return (seqLibLength + (partsNumber - 1) * (overlapLength + 1)) / partsNumber;
62 }
63 
64 // size of vector that contain all results
calcSizeRow(int,int,int partsNumber,int partSeqSize)65 int calcSizeRow(int, int, int partsNumber, int partSeqSize) {
66     return (partSeqSize + 1) * partsNumber;
67 }
68 
estimateNeededGpuMemory(int seqLibLength,ScoreType qProfLen,int queryLength,const U2::SmithWatermanSettings::SWResultView resultView)69 quint64 sw_cuda_cpp::estimateNeededGpuMemory(int seqLibLength, ScoreType qProfLen, int queryLength, const U2::SmithWatermanSettings::SWResultView resultView) {
70     int sizeP = qProfLen * sizeof(ScoreType);
71     int sizeL = (seqLibLength) * sizeof(char);
72 
73     const int overlapLength = calcOverlap(queryLength);
74     int partsNumber = calcPartsNumber(seqLibLength, overlapLength);
75     int partSeqSize = calcPartSeqSize(seqLibLength, overlapLength, partsNumber);
76     int sizeRow = calcSizeRow(seqLibLength, overlapLength, partsNumber, partSeqSize);
77 
78     int sizeN = 7 * sizeRow * sizeof(ScoreType);
79 
80     int directionMatrixSize = 0;
81     int backtraceBeginsSize = 0;
82     if (U2::SmithWatermanSettings::MULTIPLE_ALIGNMENT == resultView) {
83         directionMatrixSize = sizeof(int) * queryLength * seqLibLength;
84         backtraceBeginsSize = sizeof(int) * sizeRow * 2;
85     }
86 
87     quint64 memToAlloc = sizeL + sizeP + sizeN;  // see cudaMallocs in sw_cuda.cu for details
88     return memToAlloc * 1.2;  // just for safety
89 }
90 
estimateNeededRamAmount(int seqLibLength,ScoreType,int queryLength,const U2::SmithWatermanSettings::SWResultView resultView)91 quint64 sw_cuda_cpp::estimateNeededRamAmount(int seqLibLength, ScoreType, int queryLength, const U2::SmithWatermanSettings::SWResultView resultView) {
92     const int overlapLength = calcOverlap(queryLength);
93     const int partsNumber = calcPartsNumber(seqLibLength, overlapLength);
94     const int partSeqSize = calcPartSeqSize(seqLibLength, overlapLength, partsNumber);
95     const int sizeRow = calcSizeRow(seqLibLength, overlapLength, partsNumber, partSeqSize);
96 
97     int directionMatrixSize = 0;
98     int backtraceBeginsSize = 0;
99     if (U2::SmithWatermanSettings::MULTIPLE_ALIGNMENT == resultView) {
100         directionMatrixSize = sizeof(int) * queryLength * seqLibLength;
101         backtraceBeginsSize = sizeof(int) * sizeRow * 2;
102     }
103 
104     const quint64 memToAlloc = 3 * sizeRow * sizeof(ScoreType) + directionMatrixSize + backtraceBeginsSize;
105     return memToAlloc;
106 }
107 
108 // IMPORTANT: these settings depend on the video card
109 // TODO: develop logic for calculation this settings
110 const int sw_cuda_cpp::MAX_BLOCKS_NUMBER = 14;
111 // we have 3 shared vector, this mean all shared memory = MAX_SHARED_VECTOR_LENGTH * 3
112 const int sw_cuda_cpp::MAX_SHARED_VECTOR_LENGTH = 128;
113 
114 //__global__
115 extern void calculateMatrix_wrap(int blockSize, int threadNum, const char *seqLib, ScoreType *queryProfile, ScoreType *g_HdataUp, ScoreType *g_HdataRec, ScoreType *g_HdataMax, ScoreType *g_FdataUp, ScoreType *g_directionsUp, ScoreType *g_directionsRec, ScoreType *g_directionsMax, int iteration, int *g_directionsMatrix, int *g_backtraceBegins);
116 
117 extern void setConstants(int partSeqSize, int partsNumber, int overlapLength, int seqLibLength, int queryLength, int gapOpen, int gapExtension, int maxScore, int queryPartLength, char upSymbolDirectMatrix, char leftSymbolDirectMatrix, char diagSymbolDirectMatrix, char stopSymbolDirectMatrix);
118 
calculateOnGPU(const char * seqLib,int seqLibLength,ScoreType * queryProfile,ScoreType qProfLen,int queryLength,ScoreType gapOpen,ScoreType gapExtension,ScoreType maxScore,U2::SmithWatermanSettings::SWResultView resultView)119 QList<resType> calculateOnGPU(const char *seqLib, int seqLibLength, ScoreType *queryProfile, ScoreType qProfLen, int queryLength, ScoreType gapOpen, ScoreType gapExtension, ScoreType maxScore, U2::SmithWatermanSettings::SWResultView resultView) {
120     QList<resType> pas;
121     if (seqLibLength < queryLength) {
122         u2log.details(QObject::tr("Pattern length (%1) is longer than search sequence length (%2).").arg(queryLength).
123                                                                                                      arg(seqLibLength));
124         return pas;
125     }
126 
127     // TODO: calculate maximum alignment length
128     const int overlapLength = calcOverlap(queryLength);
129 
130     int partsNumber = calcPartsNumber(seqLibLength, overlapLength);
131 
132     int queryDevider = 1;
133     if (queryLength > sw_cuda_cpp::MAX_SHARED_VECTOR_LENGTH) {
134         queryDevider = (queryLength + sw_cuda_cpp::MAX_SHARED_VECTOR_LENGTH - 1) / sw_cuda_cpp::MAX_SHARED_VECTOR_LENGTH;
135     }
136 
137     int partQuerySize = (queryLength + queryDevider - 1) / queryDevider;
138 
139     int partSeqSize = calcPartSeqSize(seqLibLength, overlapLength, partsNumber);
140 
141     int sizeRow = calcSizeRow(seqLibLength, overlapLength, partsNumber, partSeqSize);
142 
143     u2log.details(QString("partsNumber: %1 queryDevider: %2").arg(partsNumber).arg(queryDevider));
144 
145     u2log.details(QString("seqLen: %1 partSeqSize: %2 overlapSize: %3").arg(seqLibLength).arg(partSeqSize).arg(overlapLength));
146     u2log.details(QString("queryLen %1 partQuerySize: %2").arg(queryLength).arg(partQuerySize));
147 
148     //************************** declare some temp variables on host
149 
150     ScoreType *tempRow = new ScoreType[sizeRow];
151     ScoreType *zerroArr = new ScoreType[sizeRow];
152     for (int i = 0; i < sizeRow; i++) {
153         zerroArr[i] = 0;
154     }
155 
156     ScoreType *directionRow = new ScoreType[sizeRow];
157 
158     size_t directionMatrixSize = 0;
159     size_t backtraceBeginsSize = 0;
160     int *globalMatrix = nullptr;
161     int *backtraceBegins = nullptr;
162     if (U2::SmithWatermanSettings::MULTIPLE_ALIGNMENT == resultView) {
163         directionMatrixSize = seqLibLength * queryLength * sizeof(int);
164         backtraceBeginsSize = 2 * sizeRow * sizeof(int);
165 
166         globalMatrix = new int[directionMatrixSize / sizeof(int)];
167         backtraceBegins = new int[backtraceBeginsSize / sizeof(int)];
168 
169         memset(globalMatrix, 0, directionMatrixSize);
170         memset(backtraceBegins, 0, backtraceBeginsSize);
171     }
172     //************************** sizes of arrays
173 
174     size_t sizeQ = sizeRow * sizeof(ScoreType);
175     size_t sizeQQ = (sizeRow) * sizeof(ScoreType);
176     size_t sizeP = qProfLen * sizeof(ScoreType);
177     size_t sizeL = (seqLibLength) * sizeof(char);
178 
179     //************************** declare arrays on device
180 
181     char      *g_seqLib           = nullptr;
182     ScoreType *g_queryProfile     = nullptr;
183     ScoreType *g_HdataMax         = nullptr;
184     ScoreType *g_HdataUp          = nullptr;
185     ScoreType *g_HdataRec         = nullptr;
186     ScoreType *g_HdataTmp;
187     ScoreType *g_FdataUp          = nullptr;
188     ScoreType *g_directionsUp     = nullptr;
189     ScoreType *g_directionsMax    = nullptr;
190     ScoreType *g_directionsRec    = nullptr;
191     int       *g_directionsMatrix = nullptr;
192     int       *g_backtraceBegins  = nullptr;
193 
194     //************************** allocate global memory on device
195 
196     QList<cudaError> errors = { cudaMalloc((void **)&g_seqLib, sizeL),
197                                 cudaMalloc((void **)&g_queryProfile, sizeP),
198                                 cudaMalloc((void **)&g_HdataMax, sizeQ),
199                                 cudaMalloc((void **)&g_HdataUp, sizeQ),
200                                 cudaMalloc((void **)&g_FdataUp, sizeQ),
201                                 cudaMalloc((void **)&g_directionsUp, sizeQ),
202                                 cudaMalloc((void **)&g_directionsMax, sizeQ),
203                                 cudaMalloc((void **)&g_HdataRec, sizeQ),
204                                 cudaMalloc((void **)&g_directionsRec, sizeQ) };
205 
206     if (U2::SmithWatermanSettings::MULTIPLE_ALIGNMENT == resultView) {
207         cudaError errorMatrix = cudaMalloc(reinterpret_cast<void **>(&g_directionsMatrix), directionMatrixSize);
208         cudaError errorBacktrace = cudaMalloc(reinterpret_cast<void **>(&g_backtraceBegins), backtraceBeginsSize);
209         errors << errorMatrix << errorBacktrace;
210     }
211 
212     if (std::find_if(errors.constBegin(), errors.constEnd(), [](const cudaError error) {
213             return error != cudaError::cudaSuccess; }) != errors.constEnd()) {
214         u2log.error(QObject::tr("CUDA malloc error"));
215         cudaFree(g_seqLib);
216         cudaFree(g_queryProfile);
217         cudaFree(g_HdataMax);
218         cudaFree(g_HdataUp);
219         cudaFree(g_FdataUp);
220         cudaFree(g_directionsUp);
221         cudaFree(g_directionsMax);
222         cudaFree(g_HdataRec);
223         cudaFree(g_directionsRec);
224         cudaFree(g_directionsMatrix);
225         cudaFree(g_backtraceBegins);
226         delete[] tempRow;
227         delete[] directionRow;
228         delete[] zerroArr;
229         delete[] globalMatrix;
230         delete[] backtraceBegins;
231 
232         return pas;
233     }
234 
235     u2log.details(QString("GLOBAL MEMORY USED %1 KB").arg((sizeL + sizeP + sizeQ * 7 + directionMatrixSize + backtraceBeginsSize) / 1024));
236 
237     //************************** copy from host to device
238 
239     cudaMemcpy(g_seqLib, seqLib, sizeL, cudaMemcpyHostToDevice);
240     cudaMemcpy(g_queryProfile, queryProfile, sizeP, cudaMemcpyHostToDevice);
241     cudaMemcpy(g_HdataMax, zerroArr, sizeQ, cudaMemcpyHostToDevice);
242     cudaMemcpy(g_HdataUp, zerroArr, sizeQ, cudaMemcpyHostToDevice);
243     cudaMemcpy(g_FdataUp, zerroArr, sizeQ, cudaMemcpyHostToDevice);
244     cudaMemcpy(g_directionsUp, zerroArr, sizeQ, cudaMemcpyHostToDevice);
245     cudaMemcpy(g_directionsMax, zerroArr, sizeQ, cudaMemcpyHostToDevice);
246     cudaMemcpy(g_directionsRec, zerroArr, sizeQ, cudaMemcpyHostToDevice);
247     cudaMemcpy(g_HdataRec, zerroArr, sizeQ, cudaMemcpyHostToDevice);
248     if (U2::SmithWatermanSettings::MULTIPLE_ALIGNMENT == resultView) {
249         cudaMemcpy(g_directionsMatrix, globalMatrix, directionMatrixSize, cudaMemcpyHostToDevice);
250         cudaMemcpy(g_backtraceBegins, backtraceBegins, backtraceBeginsSize, cudaMemcpyHostToDevice);
251     }
252 
253     //************************** start calculation
254 
255     int BLOCK_SIZE = partsNumber;
256 
257     dim3 dimBlock(BLOCK_SIZE);
258     dim3 dimGrid(partQuerySize);
259 
260     // move constants variables to constant cuda memory
261     setConstants(partSeqSize, partsNumber, overlapLength, seqLibLength, queryLength, gapOpen, gapExtension, maxScore, partQuerySize, U2::SmithWatermanAlgorithm::UP, U2::SmithWatermanAlgorithm::LEFT, U2::SmithWatermanAlgorithm::DIAG, U2::SmithWatermanAlgorithm::STOP);
262 
263     size_t sh_mem_size = sizeof(ScoreType) * (dimGrid.x + 1) * 3;
264     u2log.details(QString("SHARED MEM SIZE USED: %1 B").arg(sh_mem_size));
265     // start main loop
266     for (int i = 0; i < queryDevider; i++) {
267         calculateMatrix_wrap(dimBlock.x, dimGrid.x, g_seqLib, g_queryProfile, g_HdataUp, g_HdataRec, g_HdataMax, g_FdataUp, g_directionsUp, g_directionsRec, g_directionsMax, i * partQuerySize, g_directionsMatrix, g_backtraceBegins);
268 
269         cudaError hasErrors = cudaThreadSynchronize();
270 
271         if (hasErrors != 0) {
272             u2log.trace(QString("CUDA ERROR HAPPEN, errorId: ") + QString::number(hasErrors));
273         }
274 
275         // revert arrays
276         g_HdataTmp = g_HdataRec;
277         g_HdataRec = g_HdataUp;
278         g_HdataUp = g_HdataTmp;
279 
280         g_HdataTmp = g_directionsRec;
281         g_directionsRec = g_directionsUp;
282         g_directionsUp = g_HdataTmp;
283     }
284 
285     // Copy vectors on host and find actual results
286     cudaMemcpy(tempRow, g_HdataMax, sizeQQ, cudaMemcpyDeviceToHost);
287     cudaMemcpy(directionRow, g_directionsMax, sizeQQ, cudaMemcpyDeviceToHost);
288     if (U2::SmithWatermanSettings::MULTIPLE_ALIGNMENT == resultView) {
289         cudaMemcpy(globalMatrix, g_directionsMatrix, directionMatrixSize, cudaMemcpyDeviceToHost);
290         cudaMemcpy(backtraceBegins, g_backtraceBegins, backtraceBeginsSize, cudaMemcpyDeviceToHost);
291     }
292 
293     resType res;
294     for (int j = 0; j < (sizeRow); j++) {
295         if (tempRow[j] >= maxScore) {
296             res.refSubseq.startPos = directionRow[j];
297             res.refSubseq.length = j - res.refSubseq.startPos + 1 - (j) / (partSeqSize + 1) * overlapLength - (j) / (partSeqSize + 1);
298             res.score = tempRow[j];
299             if (U2::SmithWatermanSettings::MULTIPLE_ALIGNMENT == resultView) {
300                 qint32 pairAlignOffset = 0;
301 
302                 qint32 row = backtraceBegins[2 * j];
303                 qint32 column = backtraceBegins[2 * j + 1];
304                 while (U2::SmithWatermanAlgorithm::STOP != globalMatrix[seqLibLength * row + column]) {
305                     if (U2::SmithWatermanAlgorithm::DIAG == globalMatrix[seqLibLength * row + column]) {
306                         res.pairAlign[pairAlignOffset++] = U2::SmithWatermanAlgorithm::DIAG;
307                         row--;
308                         column--;
309                     } else if (U2::SmithWatermanAlgorithm::LEFT == globalMatrix[seqLibLength * row + column]) {
310                         res.pairAlign[pairAlignOffset++] = U2::SmithWatermanAlgorithm::UP;
311                         column--;
312                     } else if (U2::SmithWatermanAlgorithm::UP == globalMatrix[seqLibLength * row + column]) {
313                         res.pairAlign[pairAlignOffset++] = U2::SmithWatermanAlgorithm::LEFT;
314                         row--;
315                     }
316                     if (0 >= row || 0 >= column) {
317                         break;
318                     }
319                 }
320                 res.patternSubseq.startPos = row;
321                 res.patternSubseq.length = backtraceBegins[2 * j] - row + 1;
322             }
323 
324             pas.append(res);
325         }
326     }
327 
328     // deallocation memory
329     cudaFree(g_seqLib);
330     cudaFree(g_queryProfile);
331     cudaFree(g_HdataMax);
332     cudaFree(g_HdataUp);
333     cudaFree(g_HdataRec);
334     cudaFree(g_FdataUp);
335     cudaFree(g_directionsUp);
336     cudaFree(g_directionsMax);
337     cudaFree(g_directionsRec);
338 
339     if (U2::SmithWatermanSettings::MULTIPLE_ALIGNMENT == resultView) {
340         cudaFree(g_directionsMatrix);
341         cudaFree(g_backtraceBegins);
342     }
343 
344     delete[] tempRow;
345     delete[] directionRow;
346     delete[] zerroArr;
347     delete[] globalMatrix;
348     delete[] backtraceBegins;
349 
350     return pas;
351 }
352 
353 #endif  // SW2_BUILD_WITH_CUDA
354