1 /**
2  * UGENE - Integrated Bioinformatics Tools.
3  * Copyright (C) 2008-2021 UniPro <ugene@unipro.ru>
4  * http://ugene.net
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License
8  * as published by the Free Software Foundation; either version 2
9  * of the License, or (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
19  * MA 02110-1301, USA.
20  */
21 
22 #include "DifferentialFormat.h"
23 
24 #include <U2Core/AnnotationTableObject.h>
25 #include <U2Core/BaseDocumentFormats.h>
26 #include <U2Core/IOAdapter.h>
27 #include <U2Core/IOAdapterTextStream.h>
28 #include <U2Core/U2DbiUtils.h>
29 #include <U2Core/U2ObjectDbi.h>
30 #include <U2Core/U2OpStatusUtils.h>
31 #include <U2Core/U2SafePoints.h>
32 
33 namespace U2 {
34 
// Column delimiter used when parsing and writing the differential table.
static const QString SEPARATOR("\t");
// Name of the column that holds the genomic locus ("chr_name:start_pos-end_pos").
static const QString LOCUS_COLUMN("locus");
// Delimiter between the chromosome name and the coordinates inside a locus string.
static const QString LOCUS_SEP1(":");
// Delimiter between the start and end coordinates inside a locus string.
static const QString LOCUS_SEP2("-");
// Name of the qualifier in which the chromosome name of a parsed locus is stored.
static const QString CHROMOSOME("chromosome");
// Chromosome name written to a locus when an annotation has no chromosome qualifier.
static const QString UNKNOWN_CHR("unknown");
41 
DifferentialFormat(QObject * parent)42 DifferentialFormat::DifferentialFormat(QObject *parent)
43     : TextDocumentFormat(parent, BaseDocumentFormats::DIFF, DocumentFormatFlags_W1, {"diff"}) {
44     formatName = tr("Differential");
45     supportedObjectTypes += GObjectTypes::ANNOTATION_TABLE;
46     formatDescription = tr("Differential format is a text-based format for"
47                            " representing Cuffdiff differential output files: expression,"
48                            " splicing, promoters and cds.");
49 }
50 
getColumns()51 QList<ColumnDataParser::Column> DifferentialFormat::getColumns() {
52     return {
53         ColumnDataParser::Column("test_id", ColumnDataParser::STRING, "-", true /*required*/),
54         ColumnDataParser::Column("gene_id", ColumnDataParser::STRING, "-"),
55         ColumnDataParser::Column("gene", ColumnDataParser::STRING, "-"),
56         ColumnDataParser::Column(LOCUS_COLUMN, ColumnDataParser::STRING, "-", true /*required*/),
57         ColumnDataParser::Column("sample_1", ColumnDataParser::STRING, "-", true /*required*/),
58         ColumnDataParser::Column("sample_2", ColumnDataParser::STRING, "-", true /*required*/),
59         ColumnDataParser::Column("status", ColumnDataParser::STRING, "-", true /*required*/),
60         ColumnDataParser::Column("value_1", ColumnDataParser::DOUBLE, "1"),
61         ColumnDataParser::Column("value_2", ColumnDataParser::DOUBLE, "1"),
62         ColumnDataParser::Column("log2(fold_change)", ColumnDataParser::DOUBLE, "0"),
63         ColumnDataParser::Column("sqrt(JS)", ColumnDataParser::DOUBLE, "0"),
64         ColumnDataParser::Column("test_stat", ColumnDataParser::DOUBLE, "0"),
65         ColumnDataParser::Column("p_value", ColumnDataParser::DOUBLE, "1"),
66         ColumnDataParser::Column("q_value", ColumnDataParser::DOUBLE, "1"),
67         ColumnDataParser::Column("significant", ColumnDataParser::STRING, "-", true /*required*/)};
68 }
69 
getAnnotationName()70 QString DifferentialFormat::getAnnotationName() {
71     return "differential";
72 }
73 
checkRawTextData(const QString & dataPrefix,const GUrl &) const74 FormatCheckResult DifferentialFormat::checkRawTextData(const QString &dataPrefix, const GUrl &) const {
75     QStringList lines = dataPrefix.split("\n", QString::SkipEmptyParts);
76     CHECK(!lines.isEmpty(), FormatDetection_NotMatched);
77 
78     ColumnDataParser parser(getColumns(), SEPARATOR);
79     U2OpStatusImpl os;
80     parser.init(lines.takeFirst(), os);
81     CHECK_OP(os, FormatDetection_NotMatched);
82     CHECK(parser.getCurrentColumns().size() > 1, FormatDetection_NotMatched);
83 
84     // Check all lines. Skip the last line because it can be cut.
85     for (int i = 0; i < lines.length() - 1; i++) {
86         const QString &line = lines[i];
87         ColumnDataParser::Iterator values = parser.parseLine(line, os);
88         CHECK_OP(os, FormatDetection_NotMatched);
89         bool containsLocus = false;
90         for (; !values.isEnded(); values.takeString()) {
91             if (values.currentName() == LOCUS_COLUMN) {
92                 containsLocus = true;
93             }
94         }
95         CHECK(containsLocus, FormatDetection_NotMatched);
96     }
97 
98     return FormatDetection_Matched;
99 }
100 
parseLocus(const QString & locus,SharedAnnotationData & data,U2OpStatus & os)101 bool DifferentialFormat::parseLocus(const QString &locus, SharedAnnotationData &data, U2OpStatus &os) {
102     // locus == chr_name:start_pos-end_pos
103     QString error = tr("Can not parse locus string: %1").arg(locus);
104 
105     QStringList tokens = locus.split(LOCUS_SEP1);
106     CHECK_EXT(tokens.size() == 2, os.setError(error), false);
107     QString name = tokens[0];
108     tokens = tokens[1].split(LOCUS_SEP2);
109     CHECK_EXT(tokens.size() == 2, os.setError(error), false);
110 
111     U2Region region;
112     bool ok = false;
113     region.startPos = tokens[0].toLongLong(&ok);
114     CHECK_EXT(ok, os.setError(error), false);
115     qint64 end = tokens[1].toLongLong(&ok);
116     CHECK_EXT(ok, os.setError(error), false);
117     CHECK_EXT(region.startPos < end, os.setError(error), false);
118     region.length = end - region.startPos + 1;
119 
120     data->qualifiers << U2Qualifier(CHROMOSOME, name);
121     data->location->regions << region;
122     return true;
123 }
124 
parseAnnotations(const ColumnDataParser & parser,IOAdapterReader & reader,U2OpStatus & os)125 QList<SharedAnnotationData> DifferentialFormat::parseAnnotations(const ColumnDataParser &parser, IOAdapterReader &reader, U2OpStatus &os) {
126     QList<SharedAnnotationData> anns;
127     U2OpStatus2Log logOs;
128     while (!reader.atEnd()) {
129         QString line = reader.readLine(os, MAX_LINE_LENGTH);
130         CHECK_OP(os, {});
131         if (line.isEmpty()) {
132             continue;
133         }
134 
135         ColumnDataParser::Iterator values = parser.parseLine(line, os);
136         CHECK_OP(os, anns);
137         SharedAnnotationData data(new AnnotationData());
138         bool locusFound = false;
139         while (values.isEnded()) {
140             QString value = values.look();
141             QString name = values.currentName();
142             if (values.currentType() == ColumnDataParser::INTEGER) {
143                 values.takeInt(logOs);
144             } else if (values.currentType() == ColumnDataParser::DOUBLE) {
145                 values.takeDouble(logOs);
146             } else {
147                 values.takeString();
148             }
149             if (name == LOCUS_COLUMN) {
150                 locusFound = parseLocus(value, data, logOs);
151             } else {
152                 data->qualifiers << U2Qualifier(name, value);
153             }
154         }
155         if (!locusFound) {
156             continue;
157         }
158         data->name = getAnnotationName();
159         anns << data;
160     }
161     return anns;
162 }
163 
loadTextDocument(IOAdapterReader & reader,const U2DbiRef & dbiRef,const QVariantMap & hints,U2OpStatus & os)164 Document *DifferentialFormat::loadTextDocument(IOAdapterReader &reader, const U2DbiRef &dbiRef, const QVariantMap &hints, U2OpStatus &os) {
165     DbiOperationsBlock opBlock(dbiRef, os);
166     CHECK_OP(os, nullptr);
167 
168     QList<SharedAnnotationData> anns = parseAnnotations(reader, os);
169     CHECK_OP(os, nullptr);
170 
171     QVariantMap objectHints;
172     objectHints.insert(DBI_FOLDER_HINT, hints.value(DBI_FOLDER_HINT, U2ObjectDbi::ROOT_FOLDER));
173     auto obj = new AnnotationTableObject(getAnnotationName(), dbiRef, objectHints);
174     obj->addAnnotations(anns);
175 
176     return new Document(this, reader.getFactory(), reader.getURL(), dbiRef, QList<GObject *>() << obj, hints);
177 }
178 
parseAnnotations(IOAdapterReader & reader,U2OpStatus & os)179 QList<SharedAnnotationData> DifferentialFormat::parseAnnotations(IOAdapterReader &reader, U2OpStatus &os) {
180     ColumnDataParser parser(getColumns(), SEPARATOR);
181     QString headerLine = reader.readLine(os, MAX_LINE_LENGTH);
182     CHECK_OP(os, {});
183     parser.init(headerLine, os);
184     CHECK_OP(os, {});
185 
186     return parseAnnotations(parser, reader, os);
187 }
188 
writeHeader(IOAdapterWriter & writer,const QList<ColumnDataParser::Column> & columns,U2OpStatus & os)189 void DifferentialFormat::writeHeader(IOAdapterWriter &writer, const QList<ColumnDataParser::Column> &columns, U2OpStatus &os) {
190     QString headerLine;
191     for (const ColumnDataParser::Column &column : qAsConst(columns)) {
192         headerLine += (headerLine.isEmpty() ? "" : SEPARATOR) + column.name;
193     }
194     headerLine += "\n";
195     writer.write(os, headerLine);
196 }
197 
createLocus(const SharedAnnotationData & data,U2OpStatus & os)198 QString DifferentialFormat::createLocus(const SharedAnnotationData &data, U2OpStatus &os) {
199     if (data->location->isEmpty()) {
200         os.setError(tr("Annotation has not regions"));
201         return "";
202     }
203     if (data->location->regions.size() > 1) {
204         os.setError(tr("Annotation has more than one region"));
205         return "";
206     }
207     U2Region region = data->location->regions.first();
208 
209     QVector<U2Qualifier> quals;
210     data->findQualifiers(CHROMOSOME, quals);
211     QString chr = UNKNOWN_CHR;
212     if (!quals.isEmpty()) {
213         chr = quals.first().value;
214     }
215     return chr + LOCUS_SEP1 + QString::number(region.startPos) + LOCUS_SEP2 + QString::number(region.endPos() - 1);
216 }
217 
createValue(const SharedAnnotationData & data,const ColumnDataParser::Column & column,U2OpStatus & os)218 QString DifferentialFormat::createValue(const SharedAnnotationData &data, const ColumnDataParser::Column &column, U2OpStatus &os) {
219     QVector<U2Qualifier> quals;
220     data->findQualifiers(column.name, quals);
221     if (!quals.isEmpty()) {
222         return quals.first().value;
223     } else if (column.required) {
224         os.setError(tr("Required value is missed: %1").arg(column.name));
225         return "";
226     }
227     return column.defaultValue;
228 }
229 
/**
 * Chooses which columns to write, based on the first annotation of the first object.
 *
 * When there are no objects, or the first table has no annotations, all known columns
 * are returned. Otherwise the result is the locus column plus every known column for
 * which the first annotation has a qualifier, in the order of getColumns().
 *
 * @param annObjs  annotation table objects of the document being stored.
 * @param os       receives an error when the first object is not an annotation table, or
 *                 when a required column is absent from the selection.
 * @return the selected columns; an empty list when the first object has the wrong type.
 */
QList<ColumnDataParser::Column> DifferentialFormat::getHeaderColumns(const QList<GObject *> &annObjs, U2OpStatus &os) {
    const QList<ColumnDataParser::Column> knownColumns = getColumns();
    if (annObjs.isEmpty()) {
        return knownColumns;
    }

    auto table = dynamic_cast<AnnotationTableObject *>(annObjs.first());
    if (table == nullptr) {
        os.setError(tr("Annotation object not found"));
        return QList<ColumnDataParser::Column>();
    }
    if (!table->hasAnnotations()) {
        return knownColumns;
    }

    // The first annotation serves as the template for the column set.
    Annotation *sample = table->getAnnotations().first();
    QList<ColumnDataParser::Column> selected;
    for (const ColumnDataParser::Column &candidate : knownColumns) {
        bool keep = (candidate.name == LOCUS_COLUMN);
        if (!keep) {
            QList<U2Qualifier> found;
            sample->findQualifiers(candidate.name, found);
            keep = !found.isEmpty();
        }
        if (keep) {
            selected << candidate;
        }
    }

    // Report the first required column that did not make it into the selection.
    for (const ColumnDataParser::Column &candidate : knownColumns) {
        if (!candidate.required || selected.contains(candidate)) {
            continue;
        }
        os.setError(tr("Required column is missed: %1").arg(candidate.name));
        break;
    }
    return selected;
}
267 
storeTextDocument(IOAdapterWriter & writer,Document * document,U2OpStatus & os)268 void DifferentialFormat::storeTextDocument(IOAdapterWriter &writer, Document *document, U2OpStatus &os) {
269     QList<GObject *> annotationObjects = document->findGObjectByType(GObjectTypes::ANNOTATION_TABLE);
270     QList<ColumnDataParser::Column> columns = getHeaderColumns(annotationObjects, os);
271     CHECK_OP(os, );
272     writeHeader(writer, columns, os);
273     CHECK_OP(os, );
274     for (const GObject *obj : qAsConst(annotationObjects)) {
275         auto annObj = dynamic_cast<const AnnotationTableObject *>(obj);
276         SAFE_POINT(annObj != nullptr, "NULL annotation object", );
277         QList<Annotation *> annotations = annObj->getAnnotations();
278         for (const Annotation *ann : qAsConst(annotations)) {
279             QString line;
280             U2OpStatus2Log logOs;
281             for (const ColumnDataParser::Column &column : qAsConst(columns)) {
282                 line += line.isEmpty() ? "" : SEPARATOR;
283                 if (column.name == LOCUS_COLUMN) {
284                     line += createLocus(ann->getData(), logOs);
285                 } else {
286                     line += createValue(ann->getData(), column, logOs);
287                 }
288             }
289             if (logOs.hasError()) {
290                 continue;
291             }
292             line += "\n";
293             writer.write(os, line);
294             CHECK_OP(os, );
295         }
296     }
297 }
298 
299 }  // namespace U2
300