1 /**
2  * UGENE - Integrated Bioinformatics Tools.
3  * Copyright (C) 2008-2021 UniPro <ugene@unipro.ru>
4  * http://ugene.net
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License
8  * as published by the Free Software Foundation; either version 2
9  * of the License, or (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
19  * MA 02110-1301, USA.
20  */
21 
22 #include "ImportAnnotationsFromCSVTask.h"
23 
24 #include <QScopedPointer>
25 #include <QScriptValueIterator>
26 
27 #include <U2Core/AddDocumentTask.h>
28 #include <U2Core/AnnotationTableObject.h>
29 #include <U2Core/AppContext.h>
30 #include <U2Core/Counter.h>
31 #include <U2Core/DNASequenceObject.h>
32 #include <U2Core/GObjectRelationRoles.h>
33 #include <U2Core/IOAdapter.h>
34 #include <U2Core/IOAdapterUtils.h>
35 #include <U2Core/L10n.h>
36 #include <U2Core/LoadDocumentTask.h>
37 #include <U2Core/Log.h>
38 #include <U2Core/ProjectModel.h>
39 #include <U2Core/SaveDocumentTask.h>
40 #include <U2Core/ScriptEngine.h>
41 #include <U2Core/ScriptTask.h>
42 #include <U2Core/TextUtils.h>
43 #include <U2Core/U2OpStatusUtils.h>
44 #include <U2Core/U2SafePoints.h>
45 
46 #include <U2Gui/ObjectViewModel.h>
47 
48 #include <U2View/AnnotatedDNAView.h>
49 
50 namespace U2 {
51 
52 QBitArray CSVParsingConfig::QUOTES = TextUtils::createBitMap("\'\"");
53 
ImportAnnotationsFromCSVTask(ImportAnnotationsFromCSVTaskConfig & _config)54 ImportAnnotationsFromCSVTask::ImportAnnotationsFromCSVTask(ImportAnnotationsFromCSVTaskConfig &_config)
55     : Task(tr("Import annotations from CSV"), TaskFlags_NR_FOSCOE),
56       config(_config), readTask(nullptr), writeTask(nullptr), addTask(nullptr) {
57     GCOUNTER(cvar, "ImportAnnotationsFromCSVTask");
58     readTask = new ReadCSVAsAnnotationsTask(config.csvFile, config.parsingOptions);
59     addSubTask(readTask);
60 }
61 
adjustRelations(AnnotationTableObject * ao)62 static void adjustRelations(AnnotationTableObject *ao) {
63     if (!ao->findRelatedObjectsByType(GObjectTypes::SEQUENCE).isEmpty()) {
64         return;  // nothing to adjust -> already has relation
65     }
66 
67     // try automatically associate annotations doc with active sequence view
68     GObjectViewWindow *activeViewWindow = GObjectViewUtils::getActiveObjectViewWindow();
69     if (activeViewWindow == nullptr) {
70         return;
71     }
72     AnnotatedDNAView *seqView = qobject_cast<AnnotatedDNAView *>(activeViewWindow->getObjectView());
73     if (seqView == nullptr) {
74         return;
75     }
76 
77     foreach (U2SequenceObject *seqObj, seqView->getSequenceObjectsWithContexts()) {
78         U2Region seqRegion(0, seqObj->getSequenceLength());
79         bool outOfRange = false;
80         foreach (Annotation *ann, ao->getAnnotations()) {
81             const QVector<U2Region> &locations = ann->getRegions();
82             if (!seqRegion.contains(locations.last())) {
83                 outOfRange = true;
84                 break;
85             }
86         }
87         if (!outOfRange) {
88             ao->addObjectRelation(seqObj, ObjectRole_Sequence);
89             seqView->addObject(ao);
90             break;
91         } else {
92             algoLog.trace(QString("Annotation is out of the sequence range %1").arg(seqObj->getGObjectName()));
93         }
94     }
95 }
96 
onSubTaskFinished(Task * subTask)97 QList<Task *> ImportAnnotationsFromCSVTask::onSubTaskFinished(Task *subTask) {
98     QList<Task *> result;
99     if (hasError() || subTask == addTask) {
100         return result;
101     }
102 
103     GUrl docUrl(config.dstFile);
104     Document *projDoc = AppContext::getProject()->findDocumentByURL(docUrl);
105     bool inProject = projDoc != nullptr;
106 
107     if (doc.isNull() && projDoc != nullptr) {
108         doc = projDoc;
109     }
110     if (doc.isNull()) {  // document is null -> save it and add to the project
111         assert(subTask == readTask);
112         doc = prepareNewDocument(prepareAnnotations());
113         writeTask = new SaveDocumentTask(doc);
114         result.append(writeTask);
115     } else if (writeTask != nullptr && !inProject) {  // document was saved -> add to the project
116         addTask = new AddDocumentTask(doc);
117         result.append(addTask);
118     } else {  // document already in the project -> check loaded state and add annotations to it
119         assert(inProject);
120         if (!doc->isLoaded()) {
121             result.append(new LoadUnloadedDocumentTask(doc));
122         } else {
123             DocumentFormatConstraints dfc;
124             dfc.flagsToSupport = DocumentFormatFlag_SupportWriting;
125             dfc.supportedObjectTypes += GObjectTypes::ANNOTATION_TABLE;
126             if (!doc->getDocumentFormat()->checkConstraints(dfc)) {
127                 setError(tr("Annotations can't be added to the document %1").arg(doc->getURLString()));
128                 return result;
129             }
130             if (doc->isStateLocked()) {
131                 setError(tr("Document is locked and can't be modified %1").arg(doc->getURLString()));
132                 return result;
133             }
134             QList<GObject *> objs = doc->findGObjectByType(GObjectTypes::ANNOTATION_TABLE);
135             AnnotationTableObject *ao = objs.isEmpty() ? nullptr : qobject_cast<AnnotationTableObject *>(objs.first());
136             if (ao == nullptr) {
137                 ao = new AnnotationTableObject("Annotations", doc->getDbiRef());
138                 adjustRelations(ao);
139             }
140             SAFE_POINT(ao != nullptr, "Invalid annotation table", result);
141             QMap<QString, QList<SharedAnnotationData>> groups = prepareAnnotations();
142             foreach (const QString &groupName, groups.keys()) {
143                 ao->addAnnotations(groups[groupName], groupName);
144             }
145         }
146     }
147     return result;
148 }
149 
prepareAnnotations() const150 QMap<QString, QList<SharedAnnotationData>> ImportAnnotationsFromCSVTask::prepareAnnotations() const {
151     QMap<QString, QList<SharedAnnotationData>> result;
152 
153     SAFE_POINT(readTask != nullptr && readTask->isFinished(), "Invalid read annotations task!", result);
154     QMap<QString, QList<SharedAnnotationData>> datas = readTask->getResult();
155     foreach (const QString &groupName, datas.keys()) {
156         foreach (const SharedAnnotationData &d, datas[groupName]) {
157             result[groupName] << d;
158         }
159     }
160     return result;
161 }
162 
prepareNewDocument(const QMap<QString,QList<SharedAnnotationData>> & groups)163 Document *ImportAnnotationsFromCSVTask::prepareNewDocument(const QMap<QString, QList<SharedAnnotationData>> &groups) {
164     DocumentFormat *format = AppContext::getDocumentFormatRegistry()->getFormatById(config.formatId);
165     CHECK(nullptr != format, nullptr);
166 
167     IOAdapterId ioId = IOAdapterUtils::url2io(config.dstFile);
168     IOAdapterFactory *iof = AppContext::getIOAdapterRegistry()->getIOAdapterFactoryById(ioId);
169 
170     U2OpStatus2Log os;
171     Document *result = format->createNewLoadedDocument(iof, config.dstFile, os);
172     CHECK_OP(os, nullptr);
173 
174     AnnotationTableObject *ao = new AnnotationTableObject("Annotations", result->getDbiRef());
175     foreach (const QString &groupName, groups.keys()) {
176         ao->addAnnotations(groups[groupName], groupName);
177     }
178     ao->setModified(false);
179     result->addObject(ao);
180 
181     adjustRelations(ao);
182 
183     return result;
184 }
185 
186 //////////////////////////////////////////////////////////////////////////
187 // ReadCSVAsAnnotationsTask
188 
ReadCSVAsAnnotationsTask(const QString & _file,const CSVParsingConfig & _config)189 ReadCSVAsAnnotationsTask::ReadCSVAsAnnotationsTask(const QString &_file, const CSVParsingConfig &_config)
190     : Task(tr("Parse CSV file %1").arg(_file), TaskFlag_None), file(_file), config(_config) {
191 }
192 
193 #define BUFF_SIZE 8192
run()194 void ReadCSVAsAnnotationsTask::run() {
195     GUrl url(file);
196     IOAdapterId ioId = IOAdapterUtils::url2io(url);
197     IOAdapterFactory *iof = AppContext::getIOAdapterRegistry()->getIOAdapterFactoryById(ioId);
198     QScopedPointer<IOAdapter> io(iof->createIOAdapter());
199 
200     if (!io->open(url, IOAdapterMode_Read)) {
201         setError(L10N::errorOpeningFileRead(url));
202         return;
203     }
204 
205     QByteArray block(BUFF_SIZE, '\0');
206     int blockLen = 0;
207     QString text;
208     while ((blockLen = io->readBlock(block.data(), BUFF_SIZE)) > 0) {
209         int sizeBefore = text.length();
210         QString line = QString::fromLocal8Bit(block.data(), blockLen);
211         text.append(line);
212         if (text.length() != sizeBefore + line.length()) {
213             setError(L10N::errorReadingFile(url));
214         }
215         stateInfo.progress = io->getProgress();
216     }
217     int maxColumns = 0;
218     QList<QStringList> parsedLines = parseLinesIntoTokens(text, config, maxColumns, stateInfo);
219 
220     foreach (const QStringList &lineTokens, parsedLines) {
221         SharedAnnotationData a(new AnnotationData);
222         bool ok = true;
223         QString error;
224         int startPos = -1;
225         int startPosOffset = 0;
226         int len = -1;
227         int endPos = -1;
228         QString groupName;
229 
230         for (int column = 0; column < lineTokens.size() && ok; column++) {
231             if (column >= config.columns.size()) {
232                 break;
233             }
234             const ColumnConfig &columnConf = config.columns.at(column);
235             const QString &token = lineTokens.at(column);
236             switch (columnConf.role) {
237                 case ColumnRole_Qualifier:
238                     assert(!columnConf.qualifierName.isEmpty());
239                     a->qualifiers.append(U2Qualifier(columnConf.qualifierName, token));
240                     break;
241                 case ColumnRole_Name:
242                     a->name = token.isEmpty() ? config.defaultAnnotationName : token;
243                     ok = Annotation::isValidAnnotationName(a->name);
244                     if (!ok) {
245                         error = tr("Invalid annotation name: '%1'").arg(a->name);
246                     }
247                     break;
248                 case ColumnRole_StartPos:
249                     assert(startPos == -1);
250                     startPos = token.toInt(&ok) - 1;
251                     startPosOffset = columnConf.startPositionOffset;
252                     if (!ok) {
253                         error = tr("Start offset is not numeric: '%1'").arg(token);
254                     }
255                     break;
256                 case ColumnRole_EndPos:
257                     assert(endPos == -1);
258                     endPos = token.toInt(&ok) + (columnConf.endPositionIsInclusive ? 1 : 0) - 1;
259                     if (!ok) {
260                         error = tr("End offset is not numeric: '%1'").arg(token);
261                     }
262                     break;
263                 case ColumnRole_Length:
264                     assert(len == -1);
265                     len = token.toInt(&ok);
266                     if (!ok) {
267                         error = tr("Length is not numeric: '%1'").arg(token);
268                     }
269                     break;
270                 case ColumnRole_ComplMark:
271                     a->location->strand = (columnConf.complementMark.isEmpty() || token == columnConf.complementMark) ? U2Strand::Complementary : U2Strand::Direct;
272                     break;
273                 case ColumnRole_Group:
274                     groupName = token;
275                     break;
276                 default:
277                     assert(columnConf.role == ColumnRole_Ignore);
278             }
279         }
280 
281         // add annotation
282         if (ok) {
283             // set up default name
284             if (a->name.isEmpty()) {
285                 a->name = config.defaultAnnotationName;
286             }
287             // set up location
288             U2Region location;
289             if (startPos != -1) {
290                 location.startPos = startPos + startPosOffset;
291                 if (endPos != -1) {
292                     location.length = endPos - startPos;
293                 } else {
294                     location.length = len;
295                 }
296             } else {
297                 location.length = len;
298                 location.startPos = endPos - len;
299             }
300             if (location.length < 0) {
301                 location.startPos = location.startPos + location.length;
302                 location.length = -location.length;
303             }
304             if (location.startPos < 0 || location.startPos > location.endPos()) {
305                 algoLog.details(tr("Invalid location: start: %1  len: %2, in line :%3, ignoring")
306                                     .arg(QString::number(location.startPos))
307                                     .arg(QString::number(location.length))
308                                     .arg(lineTokens.join(config.splitToken)));
309             } else {
310                 a->location->regions.append(location);
311                 result[groupName] << a;
312             }
313         } else {
314             // TODO: make configurable to allow stop parsing on any error!
315             algoLog.details(tr("Can't parse line: '%1', error = %2, ignoring").arg(lineTokens.join(config.splitToken)).arg(error));
316         }
317     }
318 }
319 
/**
 * Returns a copy of the token list where every token that starts and ends with the same
 * quote character (see CSVParsingConfig::QUOTES) has that pair of quotes stripped.
 * Tokens shorter than 2 characters and unquoted tokens are copied unchanged.
 */
static QStringList removeQuotes(const QStringList &tokens) {
    QStringList unquotedTokens;
    for (int i = 0; i < tokens.size(); i++) {
        const QString &token = tokens.at(i);
        const int tokenLen = token.length();

        bool quoted = false;
        if (tokenLen >= 2) {
            const char first = token.at(0).toLatin1();
            const char last = token.at(tokenLen - 1).toLatin1();
            quoted = (first == last) && CSVParsingConfig::QUOTES.testBit(quint8(first));
        }
        unquotedTokens.append(quoted ? token.mid(1, tokenLen - 2) : token);
    }
    return unquotedTokens;
}
338 
parseLinesIntoTokens(const QString & text,const CSVParsingConfig & config,int & maxColumns,TaskStateInfo & ti)339 QList<QStringList> ReadCSVAsAnnotationsTask::parseLinesIntoTokens(const QString &text, const CSVParsingConfig &config, int &maxColumns, TaskStateInfo &ti) {
340     QList<QStringList> result;
341     assert(!config.splitToken.isEmpty() || !config.parsingScript.isEmpty());
342     maxColumns = 0;
343     QStringList lines = text.split('\n', QString::SkipEmptyParts);
344     int lineNum = 1;
345     for (int l = 0; l < lines.size(); l++) {
346         if (l < config.linesToSkip) {
347             continue;
348         }
349         QString line = lines.at(l).trimmed();
350         if (!config.prefixToSkip.isEmpty() && line.startsWith(config.prefixToSkip)) {
351             continue;
352         }
353 
354         QStringList tokens = parseLineIntoTokens(line, config, ti, lineNum);
355         if (config.removeQuotes) {
356             tokens = removeQuotes(tokens);
357         }
358         lineNum++;
359         maxColumns = qMax(maxColumns, tokens.size());
360         result.append(tokens);
361     }
362     return result;
363 }
364 
365 QString ReadCSVAsAnnotationsTask::LINE_VAR("line");
366 QString ReadCSVAsAnnotationsTask::LINE_NUM_VAR("lineNum");
367 
parseLineIntoTokens(const QString & line,const CSVParsingConfig & config,TaskStateInfo & ti,int lineNum)368 QStringList ReadCSVAsAnnotationsTask::parseLineIntoTokens(const QString &line, const CSVParsingConfig &config, TaskStateInfo &ti, int lineNum) {
369     QStringList result;
370     if (config.parsingScript.isEmpty()) {
371         result = line.split(config.splitToken, config.keepEmptyParts ? QString::KeepEmptyParts : QString::SkipEmptyParts);
372         return result;
373     }
374     // run script
375     QMap<QString, QScriptValue> vars;
376     QScriptEngine engine;
377     vars[LINE_VAR] = QScriptValue(&engine, line);
378     vars[LINE_NUM_VAR] = QScriptValue(&engine, lineNum);
379     QScriptValue scriptResult = ScriptTask::runScript(&engine, vars, config.parsingScript, ti);
380     if (ti.cancelFlag || ti.hasError()) {
381         return result;
382     }
383     if (scriptResult.isString()) {
384         result.append(scriptResult.toString());
385     } else if (scriptResult.isArray()) {
386         QScriptValueIterator it(scriptResult);
387         while (it.hasNext()) {
388             it.next();
389             if (it.flags() & QScriptValue::SkipInEnumeration)
390                 continue;
391             QScriptValue val = it.value();
392             QString strVal = val.toString();
393             result.append(strVal);
394         }
395     } else {
396         ti.setError(tr("Script result is not an array of strings!"));
397     }
398     return result;
399 }
400 
/** Occurrence statistics of a single character: the character itself and how many times it was seen. */
class CharStat {
public:
    CharStat() = default;

    char ch = 0;
    int count = 0;
};
409 
countFreqs(const QString & line)410 static QVector<CharStat> countFreqs(const QString &line) {
411     QVector<CharStat> result(256);
412     QByteArray ba = line.toLocal8Bit();
413     const char *data = ba.constData();
414     char prevChar = 0;
415     for (int i = 0, n = ba.length(); i < n; i++) {
416         char c = data[i];
417 
418         if (c == prevChar && (c == ' ' || c == '\t')) {  // do not count repeating ws
419             continue;
420         }
421         result[uchar(c)].ch = c;
422         result[uchar(c)].count++;
423         prevChar = c;
424     }
425     return result;
426 }
427 
mergeFreqs(QVector<CharStat> & globalFreqs,const QVector<CharStat> & localFreqs)428 static void mergeFreqs(QVector<CharStat> &globalFreqs, const QVector<CharStat> &localFreqs) {
429     assert(globalFreqs.size() == localFreqs.size());
430     for (int i = 0, n = globalFreqs.size(); i < n; i++) {
431         if (globalFreqs.at(i).count != localFreqs.at(i).count) {
432             globalFreqs[i].count = 0;
433         }
434     }
435 }
436 
/**
 * Guesses the column separator of a CSV text.
 *
 * A character is a separator candidate only if it occurs the same number of times in every
 * examined line. Among the candidates the one with the largest weighted count wins: usual
 * separators (",;: \t") get a double weight, quotes get a half weight.
 * The first config.linesToSkip lines, lines that start with config.prefixToSkip and lines that
 * are empty after trimming are not examined.
 *
 * @return a one-character string with the separator, or an empty string if nothing was found.
 */
QString ReadCSVAsAnnotationsTask::guessSeparatorString(const QString &text, const CSVParsingConfig &config) {
    QVector<CharStat> globalFreqs;
    const QStringList lines = text.split('\n', QString::SkipEmptyParts);
    for (int l = 0; l < lines.size(); l++) {
        if (l < config.linesToSkip) {
            continue;
        }
        const QString line = lines.at(l).trimmed();
        if (line.isEmpty()) {
            continue;  // e.g. a lone '\r': a line without characters would reset all the counters
        }
        // The prefix check must precede the seeding of the global statistics, otherwise a
        // leading comment line becomes the reference all the data lines are compared with.
        if (!config.prefixToSkip.isEmpty() && line.startsWith(config.prefixToSkip)) {
            continue;
        }
        const QVector<CharStat> lineFreqs = countFreqs(line);
        if (globalFreqs.isEmpty()) {
            globalFreqs = lineFreqs;
            continue;
        }
        mergeFreqs(globalFreqs, lineFreqs);
    }
    CharStat max;
    float maxWeight = 0;
    static QString doubleWeightChars = ",;: \t";  // chars that are often used as separators
    static QString lowWeightChars = "\'\"";  // quotes and other frequent chars that are rarely used as separators
    for (int i = 0; i < globalFreqs.size(); i++) {
        const CharStat &cs = globalFreqs.at(i);
        float csWeight = cs.count;
        if (doubleWeightChars.contains(cs.ch)) {
            csWeight = csWeight * 2;
        } else if (lowWeightChars.contains(cs.ch)) {
            csWeight = csWeight / 2;
        }
        if (csWeight > maxWeight) {
            max = cs;
            maxWeight = csWeight;
        }
    }
    if (max.count == 0) {
        return QString();
    }
    return QString(QChar(max.ch));
}
477 
478 }  // namespace U2
479