1 /**
2 * UGENE - Integrated Bioinformatics Tools.
3 * Copyright (C) 2008-2021 UniPro <ugene@unipro.ru>
4 * http://ugene.net
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version 2
9 * of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
19 * MA 02110-1301, USA.
20 */
21
22 #include "ImportAnnotationsFromCSVTask.h"
23
24 #include <QScopedPointer>
25 #include <QScriptValueIterator>
26
27 #include <U2Core/AddDocumentTask.h>
28 #include <U2Core/AnnotationTableObject.h>
29 #include <U2Core/AppContext.h>
30 #include <U2Core/Counter.h>
31 #include <U2Core/DNASequenceObject.h>
32 #include <U2Core/GObjectRelationRoles.h>
33 #include <U2Core/IOAdapter.h>
34 #include <U2Core/IOAdapterUtils.h>
35 #include <U2Core/L10n.h>
36 #include <U2Core/LoadDocumentTask.h>
37 #include <U2Core/Log.h>
38 #include <U2Core/ProjectModel.h>
39 #include <U2Core/SaveDocumentTask.h>
40 #include <U2Core/ScriptEngine.h>
41 #include <U2Core/ScriptTask.h>
42 #include <U2Core/TextUtils.h>
43 #include <U2Core/U2OpStatusUtils.h>
44 #include <U2Core/U2SafePoints.h>
45
46 #include <U2Gui/ObjectViewModel.h>
47
48 #include <U2View/AnnotatedDNAView.h>
49
50 namespace U2 {
51
52 QBitArray CSVParsingConfig::QUOTES = TextUtils::createBitMap("\'\"");
53
ImportAnnotationsFromCSVTask(ImportAnnotationsFromCSVTaskConfig & _config)54 ImportAnnotationsFromCSVTask::ImportAnnotationsFromCSVTask(ImportAnnotationsFromCSVTaskConfig &_config)
55 : Task(tr("Import annotations from CSV"), TaskFlags_NR_FOSCOE),
56 config(_config), readTask(nullptr), writeTask(nullptr), addTask(nullptr) {
57 GCOUNTER(cvar, "ImportAnnotationsFromCSVTask");
58 readTask = new ReadCSVAsAnnotationsTask(config.csvFile, config.parsingOptions);
59 addSubTask(readTask);
60 }
61
adjustRelations(AnnotationTableObject * ao)62 static void adjustRelations(AnnotationTableObject *ao) {
63 if (!ao->findRelatedObjectsByType(GObjectTypes::SEQUENCE).isEmpty()) {
64 return; // nothing to adjust -> already has relation
65 }
66
67 // try automatically associate annotations doc with active sequence view
68 GObjectViewWindow *activeViewWindow = GObjectViewUtils::getActiveObjectViewWindow();
69 if (activeViewWindow == nullptr) {
70 return;
71 }
72 AnnotatedDNAView *seqView = qobject_cast<AnnotatedDNAView *>(activeViewWindow->getObjectView());
73 if (seqView == nullptr) {
74 return;
75 }
76
77 foreach (U2SequenceObject *seqObj, seqView->getSequenceObjectsWithContexts()) {
78 U2Region seqRegion(0, seqObj->getSequenceLength());
79 bool outOfRange = false;
80 foreach (Annotation *ann, ao->getAnnotations()) {
81 const QVector<U2Region> &locations = ann->getRegions();
82 if (!seqRegion.contains(locations.last())) {
83 outOfRange = true;
84 break;
85 }
86 }
87 if (!outOfRange) {
88 ao->addObjectRelation(seqObj, ObjectRole_Sequence);
89 seqView->addObject(ao);
90 break;
91 } else {
92 algoLog.trace(QString("Annotation is out of the sequence range %1").arg(seqObj->getGObjectName()));
93 }
94 }
95 }
96
onSubTaskFinished(Task * subTask)97 QList<Task *> ImportAnnotationsFromCSVTask::onSubTaskFinished(Task *subTask) {
98 QList<Task *> result;
99 if (hasError() || subTask == addTask) {
100 return result;
101 }
102
103 GUrl docUrl(config.dstFile);
104 Document *projDoc = AppContext::getProject()->findDocumentByURL(docUrl);
105 bool inProject = projDoc != nullptr;
106
107 if (doc.isNull() && projDoc != nullptr) {
108 doc = projDoc;
109 }
110 if (doc.isNull()) { // document is null -> save it and add to the project
111 assert(subTask == readTask);
112 doc = prepareNewDocument(prepareAnnotations());
113 writeTask = new SaveDocumentTask(doc);
114 result.append(writeTask);
115 } else if (writeTask != nullptr && !inProject) { // document was saved -> add to the project
116 addTask = new AddDocumentTask(doc);
117 result.append(addTask);
118 } else { // document already in the project -> check loaded state and add annotations to it
119 assert(inProject);
120 if (!doc->isLoaded()) {
121 result.append(new LoadUnloadedDocumentTask(doc));
122 } else {
123 DocumentFormatConstraints dfc;
124 dfc.flagsToSupport = DocumentFormatFlag_SupportWriting;
125 dfc.supportedObjectTypes += GObjectTypes::ANNOTATION_TABLE;
126 if (!doc->getDocumentFormat()->checkConstraints(dfc)) {
127 setError(tr("Annotations can't be added to the document %1").arg(doc->getURLString()));
128 return result;
129 }
130 if (doc->isStateLocked()) {
131 setError(tr("Document is locked and can't be modified %1").arg(doc->getURLString()));
132 return result;
133 }
134 QList<GObject *> objs = doc->findGObjectByType(GObjectTypes::ANNOTATION_TABLE);
135 AnnotationTableObject *ao = objs.isEmpty() ? nullptr : qobject_cast<AnnotationTableObject *>(objs.first());
136 if (ao == nullptr) {
137 ao = new AnnotationTableObject("Annotations", doc->getDbiRef());
138 adjustRelations(ao);
139 }
140 SAFE_POINT(ao != nullptr, "Invalid annotation table", result);
141 QMap<QString, QList<SharedAnnotationData>> groups = prepareAnnotations();
142 foreach (const QString &groupName, groups.keys()) {
143 ao->addAnnotations(groups[groupName], groupName);
144 }
145 }
146 }
147 return result;
148 }
149
prepareAnnotations() const150 QMap<QString, QList<SharedAnnotationData>> ImportAnnotationsFromCSVTask::prepareAnnotations() const {
151 QMap<QString, QList<SharedAnnotationData>> result;
152
153 SAFE_POINT(readTask != nullptr && readTask->isFinished(), "Invalid read annotations task!", result);
154 QMap<QString, QList<SharedAnnotationData>> datas = readTask->getResult();
155 foreach (const QString &groupName, datas.keys()) {
156 foreach (const SharedAnnotationData &d, datas[groupName]) {
157 result[groupName] << d;
158 }
159 }
160 return result;
161 }
162
prepareNewDocument(const QMap<QString,QList<SharedAnnotationData>> & groups)163 Document *ImportAnnotationsFromCSVTask::prepareNewDocument(const QMap<QString, QList<SharedAnnotationData>> &groups) {
164 DocumentFormat *format = AppContext::getDocumentFormatRegistry()->getFormatById(config.formatId);
165 CHECK(nullptr != format, nullptr);
166
167 IOAdapterId ioId = IOAdapterUtils::url2io(config.dstFile);
168 IOAdapterFactory *iof = AppContext::getIOAdapterRegistry()->getIOAdapterFactoryById(ioId);
169
170 U2OpStatus2Log os;
171 Document *result = format->createNewLoadedDocument(iof, config.dstFile, os);
172 CHECK_OP(os, nullptr);
173
174 AnnotationTableObject *ao = new AnnotationTableObject("Annotations", result->getDbiRef());
175 foreach (const QString &groupName, groups.keys()) {
176 ao->addAnnotations(groups[groupName], groupName);
177 }
178 ao->setModified(false);
179 result->addObject(ao);
180
181 adjustRelations(ao);
182
183 return result;
184 }
185
186 //////////////////////////////////////////////////////////////////////////
187 // ReadCSVAsAnnotationsTask
188
ReadCSVAsAnnotationsTask(const QString & _file,const CSVParsingConfig & _config)189 ReadCSVAsAnnotationsTask::ReadCSVAsAnnotationsTask(const QString &_file, const CSVParsingConfig &_config)
190 : Task(tr("Parse CSV file %1").arg(_file), TaskFlag_None), file(_file), config(_config) {
191 }
192
193 #define BUFF_SIZE 8192
run()194 void ReadCSVAsAnnotationsTask::run() {
195 GUrl url(file);
196 IOAdapterId ioId = IOAdapterUtils::url2io(url);
197 IOAdapterFactory *iof = AppContext::getIOAdapterRegistry()->getIOAdapterFactoryById(ioId);
198 QScopedPointer<IOAdapter> io(iof->createIOAdapter());
199
200 if (!io->open(url, IOAdapterMode_Read)) {
201 setError(L10N::errorOpeningFileRead(url));
202 return;
203 }
204
205 QByteArray block(BUFF_SIZE, '\0');
206 int blockLen = 0;
207 QString text;
208 while ((blockLen = io->readBlock(block.data(), BUFF_SIZE)) > 0) {
209 int sizeBefore = text.length();
210 QString line = QString::fromLocal8Bit(block.data(), blockLen);
211 text.append(line);
212 if (text.length() != sizeBefore + line.length()) {
213 setError(L10N::errorReadingFile(url));
214 }
215 stateInfo.progress = io->getProgress();
216 }
217 int maxColumns = 0;
218 QList<QStringList> parsedLines = parseLinesIntoTokens(text, config, maxColumns, stateInfo);
219
220 foreach (const QStringList &lineTokens, parsedLines) {
221 SharedAnnotationData a(new AnnotationData);
222 bool ok = true;
223 QString error;
224 int startPos = -1;
225 int startPosOffset = 0;
226 int len = -1;
227 int endPos = -1;
228 QString groupName;
229
230 for (int column = 0; column < lineTokens.size() && ok; column++) {
231 if (column >= config.columns.size()) {
232 break;
233 }
234 const ColumnConfig &columnConf = config.columns.at(column);
235 const QString &token = lineTokens.at(column);
236 switch (columnConf.role) {
237 case ColumnRole_Qualifier:
238 assert(!columnConf.qualifierName.isEmpty());
239 a->qualifiers.append(U2Qualifier(columnConf.qualifierName, token));
240 break;
241 case ColumnRole_Name:
242 a->name = token.isEmpty() ? config.defaultAnnotationName : token;
243 ok = Annotation::isValidAnnotationName(a->name);
244 if (!ok) {
245 error = tr("Invalid annotation name: '%1'").arg(a->name);
246 }
247 break;
248 case ColumnRole_StartPos:
249 assert(startPos == -1);
250 startPos = token.toInt(&ok) - 1;
251 startPosOffset = columnConf.startPositionOffset;
252 if (!ok) {
253 error = tr("Start offset is not numeric: '%1'").arg(token);
254 }
255 break;
256 case ColumnRole_EndPos:
257 assert(endPos == -1);
258 endPos = token.toInt(&ok) + (columnConf.endPositionIsInclusive ? 1 : 0) - 1;
259 if (!ok) {
260 error = tr("End offset is not numeric: '%1'").arg(token);
261 }
262 break;
263 case ColumnRole_Length:
264 assert(len == -1);
265 len = token.toInt(&ok);
266 if (!ok) {
267 error = tr("Length is not numeric: '%1'").arg(token);
268 }
269 break;
270 case ColumnRole_ComplMark:
271 a->location->strand = (columnConf.complementMark.isEmpty() || token == columnConf.complementMark) ? U2Strand::Complementary : U2Strand::Direct;
272 break;
273 case ColumnRole_Group:
274 groupName = token;
275 break;
276 default:
277 assert(columnConf.role == ColumnRole_Ignore);
278 }
279 }
280
281 // add annotation
282 if (ok) {
283 // set up default name
284 if (a->name.isEmpty()) {
285 a->name = config.defaultAnnotationName;
286 }
287 // set up location
288 U2Region location;
289 if (startPos != -1) {
290 location.startPos = startPos + startPosOffset;
291 if (endPos != -1) {
292 location.length = endPos - startPos;
293 } else {
294 location.length = len;
295 }
296 } else {
297 location.length = len;
298 location.startPos = endPos - len;
299 }
300 if (location.length < 0) {
301 location.startPos = location.startPos + location.length;
302 location.length = -location.length;
303 }
304 if (location.startPos < 0 || location.startPos > location.endPos()) {
305 algoLog.details(tr("Invalid location: start: %1 len: %2, in line :%3, ignoring")
306 .arg(QString::number(location.startPos))
307 .arg(QString::number(location.length))
308 .arg(lineTokens.join(config.splitToken)));
309 } else {
310 a->location->regions.append(location);
311 result[groupName] << a;
312 }
313 } else {
314 // TODO: make configurable to allow stop parsing on any error!
315 algoLog.details(tr("Can't parse line: '%1', error = %2, ignoring").arg(lineTokens.join(config.splitToken)).arg(error));
316 }
317 }
318 }
319
/**
 * Strips one pair of enclosing quotes from every token.
 * A token is unquoted only when it has at least two characters and its first and last
 * characters are the same character from CSVParsingConfig::QUOTES; other tokens are
 * copied unchanged.
 *
 * @param tokens tokens of one parsed line.
 * @return the tokens in the same order, with enclosing quotes removed where present.
 */
static QStringList removeQuotes(const QStringList &tokens) {
    QStringList unquotedTokens;
    for (const QString &token : tokens) {
        const int tokenLen = token.length();
        bool quoted = false;
        if (tokenLen >= 2) {
            const char first = token.at(0).toLatin1();
            const char last = token.at(tokenLen - 1).toLatin1();
            quoted = (first == last) && CSVParsingConfig::QUOTES.testBit(quint8(first));
        }
        unquotedTokens.append(quoted ? token.mid(1, tokenLen - 2) : token);
    }
    return unquotedTokens;
}
338
parseLinesIntoTokens(const QString & text,const CSVParsingConfig & config,int & maxColumns,TaskStateInfo & ti)339 QList<QStringList> ReadCSVAsAnnotationsTask::parseLinesIntoTokens(const QString &text, const CSVParsingConfig &config, int &maxColumns, TaskStateInfo &ti) {
340 QList<QStringList> result;
341 assert(!config.splitToken.isEmpty() || !config.parsingScript.isEmpty());
342 maxColumns = 0;
343 QStringList lines = text.split('\n', QString::SkipEmptyParts);
344 int lineNum = 1;
345 for (int l = 0; l < lines.size(); l++) {
346 if (l < config.linesToSkip) {
347 continue;
348 }
349 QString line = lines.at(l).trimmed();
350 if (!config.prefixToSkip.isEmpty() && line.startsWith(config.prefixToSkip)) {
351 continue;
352 }
353
354 QStringList tokens = parseLineIntoTokens(line, config, ti, lineNum);
355 if (config.removeQuotes) {
356 tokens = removeQuotes(tokens);
357 }
358 lineNum++;
359 maxColumns = qMax(maxColumns, tokens.size());
360 result.append(tokens);
361 }
362 return result;
363 }
364
365 QString ReadCSVAsAnnotationsTask::LINE_VAR("line");
366 QString ReadCSVAsAnnotationsTask::LINE_NUM_VAR("lineNum");
367
parseLineIntoTokens(const QString & line,const CSVParsingConfig & config,TaskStateInfo & ti,int lineNum)368 QStringList ReadCSVAsAnnotationsTask::parseLineIntoTokens(const QString &line, const CSVParsingConfig &config, TaskStateInfo &ti, int lineNum) {
369 QStringList result;
370 if (config.parsingScript.isEmpty()) {
371 result = line.split(config.splitToken, config.keepEmptyParts ? QString::KeepEmptyParts : QString::SkipEmptyParts);
372 return result;
373 }
374 // run script
375 QMap<QString, QScriptValue> vars;
376 QScriptEngine engine;
377 vars[LINE_VAR] = QScriptValue(&engine, line);
378 vars[LINE_NUM_VAR] = QScriptValue(&engine, lineNum);
379 QScriptValue scriptResult = ScriptTask::runScript(&engine, vars, config.parsingScript, ti);
380 if (ti.cancelFlag || ti.hasError()) {
381 return result;
382 }
383 if (scriptResult.isString()) {
384 result.append(scriptResult.toString());
385 } else if (scriptResult.isArray()) {
386 QScriptValueIterator it(scriptResult);
387 while (it.hasNext()) {
388 it.next();
389 if (it.flags() & QScriptValue::SkipInEnumeration)
390 continue;
391 QScriptValue val = it.value();
392 QString strVal = val.toString();
393 result.append(strVal);
394 }
395 } else {
396 ti.setError(tr("Script result is not an array of strings!"));
397 }
398 return result;
399 }
400
// Occurrence counter for a single character; a default-constructed value is zeroed.
class CharStat {
public:
    CharStat() = default;

    char ch = 0;    // the character the counter belongs to
    int count = 0;  // how many times the character was counted
};
409
countFreqs(const QString & line)410 static QVector<CharStat> countFreqs(const QString &line) {
411 QVector<CharStat> result(256);
412 QByteArray ba = line.toLocal8Bit();
413 const char *data = ba.constData();
414 char prevChar = 0;
415 for (int i = 0, n = ba.length(); i < n; i++) {
416 char c = data[i];
417
418 if (c == prevChar && (c == ' ' || c == '\t')) { // do not count repeating ws
419 continue;
420 }
421 result[uchar(c)].ch = c;
422 result[uchar(c)].count++;
423 prevChar = c;
424 }
425 return result;
426 }
427
mergeFreqs(QVector<CharStat> & globalFreqs,const QVector<CharStat> & localFreqs)428 static void mergeFreqs(QVector<CharStat> &globalFreqs, const QVector<CharStat> &localFreqs) {
429 assert(globalFreqs.size() == localFreqs.size());
430 for (int i = 0, n = globalFreqs.size(); i < n; i++) {
431 if (globalFreqs.at(i).count != localFreqs.at(i).count) {
432 globalFreqs[i].count = 0;
433 }
434 }
435 }
436
/**
 * Guesses the column separator of a CSV text.
 *
 * A separator candidate is a byte that occurs the same number of times in every analyzed
 * line. Empty lines, the first config.linesToSkip non-empty lines and lines starting with
 * config.prefixToSkip are not analyzed. Among the candidates the one with the highest
 * weighted count wins: typical separators get a double weight, quotes a half weight.
 *
 * @param text   text to analyze.
 * @param config parsing options (only linesToSkip and prefixToSkip are used).
 * @return a one-character string with the guessed separator, or an empty string when no
 *         candidate is found.
 */
QString ReadCSVAsAnnotationsTask::guessSeparatorString(const QString &text, const CSVParsingConfig &config) {
    QVector<CharStat> globalFreqs;
    const QStringList lines = text.split('\n', QString::SkipEmptyParts);
    for (int l = 0; l < lines.size(); l++) {
        if (l < config.linesToSkip) {
            continue;
        }
        const QString line = lines.at(l).trimmed();
        // Skip prefixed (comment) lines before anything else, so that such a line can never
        // seed the baseline statistics and no frequencies are computed for it.
        if (!config.prefixToSkip.isEmpty() && line.startsWith(config.prefixToSkip)) {
            continue;
        }
        const QVector<CharStat> lineFreqs = countFreqs(line);
        if (globalFreqs.isEmpty()) {
            globalFreqs = lineFreqs;  // the first analyzed line is the baseline
            continue;
        }
        mergeFreqs(globalFreqs, lineFreqs);
    }

    CharStat max;
    float maxWeight = 0;
    static const QString doubleWeightChars = ",;: \t";  // chars that are often used as separators
    static const QString lowWeightChars = "\'\"";  // quotes and other frequent chars that are rarely used as separators
    for (int i = 0; i < globalFreqs.size(); i++) {
        const CharStat &cs = globalFreqs.at(i);
        float csWeight = cs.count;
        if (doubleWeightChars.contains(cs.ch)) {
            csWeight = csWeight * 2;
        } else if (lowWeightChars.contains(cs.ch)) {
            csWeight = csWeight / 2;
        }
        if (csWeight > maxWeight) {
            max = cs;
            maxWeight = csWeight;
        }
    }
    if (max.count == 0) {
        return QString();
    }
    return QString(QChar(max.ch));
}
477
478 } // namespace U2
479