1 /**
2 * UGENE - Integrated Bioinformatics Tools.
3 * Copyright (C) 2008-2021 UniPro <ugene@unipro.ru>
4 * http://ugene.net
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version 2
9 * of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
19 * MA 02110-1301, USA.
20 */
21
22 #include "DifferentialFormat.h"
23
24 #include <U2Core/AnnotationTableObject.h>
25 #include <U2Core/BaseDocumentFormats.h>
26 #include <U2Core/IOAdapter.h>
27 #include <U2Core/IOAdapterTextStream.h>
28 #include <U2Core/U2DbiUtils.h>
29 #include <U2Core/U2ObjectDbi.h>
30 #include <U2Core/U2OpStatusUtils.h>
31 #include <U2Core/U2SafePoints.h>
32
33 namespace U2 {
34
35 static const QString SEPARATOR("\t");
36 static const QString LOCUS_COLUMN("locus");
37 static const QString LOCUS_SEP1(":");
38 static const QString LOCUS_SEP2("-");
39 static const QString CHROMOSOME("chromosome");
40 static const QString UNKNOWN_CHR("unknown");
41
DifferentialFormat(QObject * parent)42 DifferentialFormat::DifferentialFormat(QObject *parent)
43 : TextDocumentFormat(parent, BaseDocumentFormats::DIFF, DocumentFormatFlags_W1, {"diff"}) {
44 formatName = tr("Differential");
45 supportedObjectTypes += GObjectTypes::ANNOTATION_TABLE;
46 formatDescription = tr("Differential format is a text-based format for"
47 " representing Cuffdiff differential output files: expression,"
48 " splicing, promoters and cds.");
49 }
50
getColumns()51 QList<ColumnDataParser::Column> DifferentialFormat::getColumns() {
52 return {
53 ColumnDataParser::Column("test_id", ColumnDataParser::STRING, "-", true /*required*/),
54 ColumnDataParser::Column("gene_id", ColumnDataParser::STRING, "-"),
55 ColumnDataParser::Column("gene", ColumnDataParser::STRING, "-"),
56 ColumnDataParser::Column(LOCUS_COLUMN, ColumnDataParser::STRING, "-", true /*required*/),
57 ColumnDataParser::Column("sample_1", ColumnDataParser::STRING, "-", true /*required*/),
58 ColumnDataParser::Column("sample_2", ColumnDataParser::STRING, "-", true /*required*/),
59 ColumnDataParser::Column("status", ColumnDataParser::STRING, "-", true /*required*/),
60 ColumnDataParser::Column("value_1", ColumnDataParser::DOUBLE, "1"),
61 ColumnDataParser::Column("value_2", ColumnDataParser::DOUBLE, "1"),
62 ColumnDataParser::Column("log2(fold_change)", ColumnDataParser::DOUBLE, "0"),
63 ColumnDataParser::Column("sqrt(JS)", ColumnDataParser::DOUBLE, "0"),
64 ColumnDataParser::Column("test_stat", ColumnDataParser::DOUBLE, "0"),
65 ColumnDataParser::Column("p_value", ColumnDataParser::DOUBLE, "1"),
66 ColumnDataParser::Column("q_value", ColumnDataParser::DOUBLE, "1"),
67 ColumnDataParser::Column("significant", ColumnDataParser::STRING, "-", true /*required*/)};
68 }
69
getAnnotationName()70 QString DifferentialFormat::getAnnotationName() {
71 return "differential";
72 }
73
checkRawTextData(const QString & dataPrefix,const GUrl &) const74 FormatCheckResult DifferentialFormat::checkRawTextData(const QString &dataPrefix, const GUrl &) const {
75 QStringList lines = dataPrefix.split("\n", QString::SkipEmptyParts);
76 CHECK(!lines.isEmpty(), FormatDetection_NotMatched);
77
78 ColumnDataParser parser(getColumns(), SEPARATOR);
79 U2OpStatusImpl os;
80 parser.init(lines.takeFirst(), os);
81 CHECK_OP(os, FormatDetection_NotMatched);
82 CHECK(parser.getCurrentColumns().size() > 1, FormatDetection_NotMatched);
83
84 // Check all lines. Skip the last line because it can be cut.
85 for (int i = 0; i < lines.length() - 1; i++) {
86 const QString &line = lines[i];
87 ColumnDataParser::Iterator values = parser.parseLine(line, os);
88 CHECK_OP(os, FormatDetection_NotMatched);
89 bool containsLocus = false;
90 for (; !values.isEnded(); values.takeString()) {
91 if (values.currentName() == LOCUS_COLUMN) {
92 containsLocus = true;
93 }
94 }
95 CHECK(containsLocus, FormatDetection_NotMatched);
96 }
97
98 return FormatDetection_Matched;
99 }
100
parseLocus(const QString & locus,SharedAnnotationData & data,U2OpStatus & os)101 bool DifferentialFormat::parseLocus(const QString &locus, SharedAnnotationData &data, U2OpStatus &os) {
102 // locus == chr_name:start_pos-end_pos
103 QString error = tr("Can not parse locus string: %1").arg(locus);
104
105 QStringList tokens = locus.split(LOCUS_SEP1);
106 CHECK_EXT(tokens.size() == 2, os.setError(error), false);
107 QString name = tokens[0];
108 tokens = tokens[1].split(LOCUS_SEP2);
109 CHECK_EXT(tokens.size() == 2, os.setError(error), false);
110
111 U2Region region;
112 bool ok = false;
113 region.startPos = tokens[0].toLongLong(&ok);
114 CHECK_EXT(ok, os.setError(error), false);
115 qint64 end = tokens[1].toLongLong(&ok);
116 CHECK_EXT(ok, os.setError(error), false);
117 CHECK_EXT(region.startPos < end, os.setError(error), false);
118 region.length = end - region.startPos + 1;
119
120 data->qualifiers << U2Qualifier(CHROMOSOME, name);
121 data->location->regions << region;
122 return true;
123 }
124
parseAnnotations(const ColumnDataParser & parser,IOAdapterReader & reader,U2OpStatus & os)125 QList<SharedAnnotationData> DifferentialFormat::parseAnnotations(const ColumnDataParser &parser, IOAdapterReader &reader, U2OpStatus &os) {
126 QList<SharedAnnotationData> anns;
127 U2OpStatus2Log logOs;
128 while (!reader.atEnd()) {
129 QString line = reader.readLine(os, MAX_LINE_LENGTH);
130 CHECK_OP(os, {});
131 if (line.isEmpty()) {
132 continue;
133 }
134
135 ColumnDataParser::Iterator values = parser.parseLine(line, os);
136 CHECK_OP(os, anns);
137 SharedAnnotationData data(new AnnotationData());
138 bool locusFound = false;
139 while (values.isEnded()) {
140 QString value = values.look();
141 QString name = values.currentName();
142 if (values.currentType() == ColumnDataParser::INTEGER) {
143 values.takeInt(logOs);
144 } else if (values.currentType() == ColumnDataParser::DOUBLE) {
145 values.takeDouble(logOs);
146 } else {
147 values.takeString();
148 }
149 if (name == LOCUS_COLUMN) {
150 locusFound = parseLocus(value, data, logOs);
151 } else {
152 data->qualifiers << U2Qualifier(name, value);
153 }
154 }
155 if (!locusFound) {
156 continue;
157 }
158 data->name = getAnnotationName();
159 anns << data;
160 }
161 return anns;
162 }
163
loadTextDocument(IOAdapterReader & reader,const U2DbiRef & dbiRef,const QVariantMap & hints,U2OpStatus & os)164 Document *DifferentialFormat::loadTextDocument(IOAdapterReader &reader, const U2DbiRef &dbiRef, const QVariantMap &hints, U2OpStatus &os) {
165 DbiOperationsBlock opBlock(dbiRef, os);
166 CHECK_OP(os, nullptr);
167
168 QList<SharedAnnotationData> anns = parseAnnotations(reader, os);
169 CHECK_OP(os, nullptr);
170
171 QVariantMap objectHints;
172 objectHints.insert(DBI_FOLDER_HINT, hints.value(DBI_FOLDER_HINT, U2ObjectDbi::ROOT_FOLDER));
173 auto obj = new AnnotationTableObject(getAnnotationName(), dbiRef, objectHints);
174 obj->addAnnotations(anns);
175
176 return new Document(this, reader.getFactory(), reader.getURL(), dbiRef, QList<GObject *>() << obj, hints);
177 }
178
parseAnnotations(IOAdapterReader & reader,U2OpStatus & os)179 QList<SharedAnnotationData> DifferentialFormat::parseAnnotations(IOAdapterReader &reader, U2OpStatus &os) {
180 ColumnDataParser parser(getColumns(), SEPARATOR);
181 QString headerLine = reader.readLine(os, MAX_LINE_LENGTH);
182 CHECK_OP(os, {});
183 parser.init(headerLine, os);
184 CHECK_OP(os, {});
185
186 return parseAnnotations(parser, reader, os);
187 }
188
writeHeader(IOAdapterWriter & writer,const QList<ColumnDataParser::Column> & columns,U2OpStatus & os)189 void DifferentialFormat::writeHeader(IOAdapterWriter &writer, const QList<ColumnDataParser::Column> &columns, U2OpStatus &os) {
190 QString headerLine;
191 for (const ColumnDataParser::Column &column : qAsConst(columns)) {
192 headerLine += (headerLine.isEmpty() ? "" : SEPARATOR) + column.name;
193 }
194 headerLine += "\n";
195 writer.write(os, headerLine);
196 }
197
createLocus(const SharedAnnotationData & data,U2OpStatus & os)198 QString DifferentialFormat::createLocus(const SharedAnnotationData &data, U2OpStatus &os) {
199 if (data->location->isEmpty()) {
200 os.setError(tr("Annotation has not regions"));
201 return "";
202 }
203 if (data->location->regions.size() > 1) {
204 os.setError(tr("Annotation has more than one region"));
205 return "";
206 }
207 U2Region region = data->location->regions.first();
208
209 QVector<U2Qualifier> quals;
210 data->findQualifiers(CHROMOSOME, quals);
211 QString chr = UNKNOWN_CHR;
212 if (!quals.isEmpty()) {
213 chr = quals.first().value;
214 }
215 return chr + LOCUS_SEP1 + QString::number(region.startPos) + LOCUS_SEP2 + QString::number(region.endPos() - 1);
216 }
217
createValue(const SharedAnnotationData & data,const ColumnDataParser::Column & column,U2OpStatus & os)218 QString DifferentialFormat::createValue(const SharedAnnotationData &data, const ColumnDataParser::Column &column, U2OpStatus &os) {
219 QVector<U2Qualifier> quals;
220 data->findQualifiers(column.name, quals);
221 if (!quals.isEmpty()) {
222 return quals.first().value;
223 } else if (column.required) {
224 os.setError(tr("Required value is missed: %1").arg(column.name));
225 return "";
226 }
227 return column.defaultValue;
228 }
229
/**
 * Chooses the columns to write, based on the first annotation of the first
 * annotation object.
 *
 * The locus column is always kept; any other known column is kept only when
 * that annotation has a qualifier with the column name. When there are no
 * objects or the first object has no annotations, all known columns are used.
 *
 * @param annObjs objects of the document; only the first one is inspected.
 * @param os      receives an error when the first object is not an annotation
 *                table or when a required column was not selected.
 * @return the selected columns in table order; an empty list when the first
 *         object is not an annotation table.
 */
QList<ColumnDataParser::Column> DifferentialFormat::getHeaderColumns(const QList<GObject *> &annObjs, U2OpStatus &os) {
    const QList<ColumnDataParser::Column> knownColumns = getColumns();
    if (annObjs.isEmpty()) {
        return knownColumns;
    }

    AnnotationTableObject *table = dynamic_cast<AnnotationTableObject *>(annObjs.first());
    if (table == nullptr) {
        os.setError(tr("Annotation object not found"));
        return QList<ColumnDataParser::Column>();
    }
    if (!table->hasAnnotations()) {
        return knownColumns;
    }

    Annotation *sample = table->getAnnotations().first();
    QList<ColumnDataParser::Column> selected;
    for (const ColumnDataParser::Column &column : knownColumns) {
        bool keep = (column.name == LOCUS_COLUMN);
        if (!keep) {
            QList<U2Qualifier> found;
            sample->findQualifiers(column.name, found);
            keep = !found.isEmpty();
        }
        if (keep) {
            selected << column;
        }
    }

    // The check runs only after the selection is complete, so the list
    // returned together with an error is the full selection.
    for (const ColumnDataParser::Column &column : knownColumns) {
        if (!column.required || selected.contains(column)) {
            continue;
        }
        os.setError(tr("Required column is missed: %1").arg(column.name));
        return selected;
    }
    return selected;
}
267
storeTextDocument(IOAdapterWriter & writer,Document * document,U2OpStatus & os)268 void DifferentialFormat::storeTextDocument(IOAdapterWriter &writer, Document *document, U2OpStatus &os) {
269 QList<GObject *> annotationObjects = document->findGObjectByType(GObjectTypes::ANNOTATION_TABLE);
270 QList<ColumnDataParser::Column> columns = getHeaderColumns(annotationObjects, os);
271 CHECK_OP(os, );
272 writeHeader(writer, columns, os);
273 CHECK_OP(os, );
274 for (const GObject *obj : qAsConst(annotationObjects)) {
275 auto annObj = dynamic_cast<const AnnotationTableObject *>(obj);
276 SAFE_POINT(annObj != nullptr, "NULL annotation object", );
277 QList<Annotation *> annotations = annObj->getAnnotations();
278 for (const Annotation *ann : qAsConst(annotations)) {
279 QString line;
280 U2OpStatus2Log logOs;
281 for (const ColumnDataParser::Column &column : qAsConst(columns)) {
282 line += line.isEmpty() ? "" : SEPARATOR;
283 if (column.name == LOCUS_COLUMN) {
284 line += createLocus(ann->getData(), logOs);
285 } else {
286 line += createValue(ann->getData(), column, logOs);
287 }
288 }
289 if (logOs.hasError()) {
290 continue;
291 }
292 line += "\n";
293 writer.write(os, line);
294 CHECK_OP(os, );
295 }
296 }
297 }
298
299 } // namespace U2
300