1 /*
2 * $Id: aln_util.cpp 632526 2021-06-02 17:25:01Z ivanov $
3 *
4 * ===========================================================================
5 *
6 * PUBLIC DOMAIN NOTICE
7 * National Center for Biotechnology Information
8 *
9 * This software/database is a "United States Government Work" under the
10 * terms of the United States Copyright Act. It was written as part of
11 * the author's official duties as a United States Government employee and
12 * thus cannot be copyrighted. This software/database is freely available
13 * to the public for use. The National Library of Medicine and the U.S.
14 * Government have not placed any restriction on its use or reproduction.
15 *
16 * Although all reasonable efforts have been taken to ensure the accuracy
17 * and reliability of the software and data, the NLM and the U.S.
18 * Government do not and cannot warrant the performance or results that
19 * may be obtained by using this software or data. The NLM and the U.S.
20 * Government disclaim all warranties, express or implied, including
21 * warranties of performance, merchantability or fitness for any particular
22 * purpose.
23 *
24 * Please cite the author in any work or product based on this material.
25 *
26 * ===========================================================================
27 *
28 * Authors: Frank Ludwig
29 *
30 */
31
32 #include <ncbi_pch.hpp>
33 #include <corelib/ncbistr.hpp>
34 #include <objtools/readers/reader_error_codes.hpp>
35 #include <objtools/readers/message_listener.hpp>
36 #include <objtools/readers/alnread.hpp>
37 #include <objtools/readers/reader_error_codes.hpp>
38 #include "aln_errors.hpp"
39 #include "aln_util.hpp"
40
41 BEGIN_NCBI_SCOPE
42 BEGIN_SCOPE(objects);
43
44
45 // --------------------------------------------------------------------------
46 void
CheckId(const string & seqId,const vector<SLineInfo> & orderedIds,int idCount,int lineNum,bool firstBlock)47 AlnUtil::CheckId(const string& seqId,
48 const vector<SLineInfo>& orderedIds,
49 int idCount,
50 int lineNum,
51 bool firstBlock)
52 // --------------------------------------------------------------------------
53 {
54 string description;
55
56 if ((orderedIds.size() > idCount) &&
57 seqId == orderedIds[idCount].mData) {
58 return;
59 }
60
61
62 string seqIdLower(seqId);
63 NStr::ToLower(seqIdLower);
64
65 auto it = orderedIds.begin();
66 bool exactCopy = false;
67 while (it != orderedIds.end()) {
68 if (it->mData == seqId) {
69 exactCopy = true;
70 break;
71 }
72 auto orderedIdLower(it->mData);
73 NStr::ToLower(orderedIdLower);
74 if (orderedIdLower == seqIdLower) {
75 break;
76 }
77 ++it;
78 }
79
80
81 if (firstBlock) {
82 if (it != orderedIds.end()) {
83 if (exactCopy) {
84 description = ErrorPrintf(
85 "Duplicate ID: \"%s\" has already appeared in this block, on line %d.", seqId.c_str(), it->mNumLine);
86 }
87 else {
88 description = ErrorPrintf(
89 "Conflicting IDs: \"%s\" differs only in case from \"%s\", which has already appeared in this block, on line %d.", seqId.c_str(), it->mData.c_str(), it->mNumLine);
90 }
91 throw SShowStopper(
92 lineNum,
93 eAlnSubcode_UnexpectedSeqId,
94 description);
95
96 }
97 return;
98 }
99
100 if (it == orderedIds.end()) {
101 description = ErrorPrintf(
102 "Inconsistent sequence_IDs in the data blocks. Each data block must contain the same set of sequence_IDs.");
103 throw SShowStopper(
104 lineNum,
105 eAlnSubcode_BadSequenceCount,
106 description);
107 }
108
109
110 auto idPos = distance(orderedIds.begin(), it);
111 if (idPos < idCount) {
112 if (exactCopy) {
113 description = ErrorPrintf(
114 "Duplicate ID: \"%s \" has already appeared in this block, on line %d.",
115 seqId.c_str(), it->mNumLine);
116 }
117 else {
118 description = ErrorPrintf(
119 "Conflicting IDs: \"%s\" differs only in case from \"%s\", which has already appeared in this block, on line %d.", seqId.c_str(), it->mData.c_str(), it->mNumLine);
120 }
121 }
122 else
123 if (idPos == idCount) { //
124 description = ErrorPrintf(
125 "Inconsistent ID case: \"%s\" differs in case from \"%s\" used to identify this sequence in the first block.",
126 seqId.c_str(), it->mData.c_str());
127 }
128 else
129 {
130 description = "Sequence_IDs are in different orders in the data blocks in your file. The sequences and sequence_IDs are expected to be in the same order in each block.";
131 }
132 throw SShowStopper(
133 lineNum,
134 eAlnSubcode_UnexpectedSeqId,
135 description);
136 }
137
138
139 // --------------------------------------------------------------------------
140 void
ProcessDefline(const string & line,string & seqId,string & defLineInfo)141 AlnUtil::ProcessDefline(
142 const string& line,
143 string& seqId,
144 string& defLineInfo)
145 // --------------------------------------------------------------------------
146 {
147 if (!NStr::StartsWith(line, ">")) {
148 throw SShowStopper(
149 -1,
150 eAlnSubcode_IllegalDataLine,
151 "Deflines were detected in your file, however some lines are missing the \'>\' character at the beginning of the line. Each defline must begin with \'>\'.");
152 }
153 auto dataStart = line.find_first_not_of(" \t", 1);
154 if (dataStart == string::npos) {
155 throw SShowStopper(
156 -1,
157 eAlnSubcode_IllegalDataLine,
158 "Bad defline line: Should not be empty");
159 }
160 string defLine = line.substr(dataStart);
161 if (NStr::StartsWith(defLine, "[")) {
162 seqId.clear();
163 defLineInfo = defLine;
164 }
165 else {
166 NStr::SplitInTwo(defLine.substr(dataStart), " \t", seqId, defLineInfo,
167 NStr::fSplit_MergeDelimiters);
168 }
169 }
170
171 // ----------------------------------------------------------------------------
172 void
ProcessDataLine(const string & dataLine,string & seqId,string & seqData,int & offset)173 AlnUtil::ProcessDataLine(
174 const string& dataLine,
175 string& seqId,
176 string& seqData,
177 int& offset)
178 // ----------------------------------------------------------------------------
179 {
180 list<string> tokens;
181 NStr::Split(dataLine, " \t", tokens, NStr::fSplit_MergeDelimiters);
182 if (tokens.size() < 2) {
183 throw SShowStopper(
184 -1,
185 eAlnSubcode_IllegalDataLine,
186 "Bad data line: Expected \"<seqId> <data> <offset>\"");
187 }
188 seqId = tokens.front();
189 tokens.pop_front();
190 if (tokens.back().find_first_not_of("0123456789") == string::npos) {
191 // trailing token is offset
192 offset = NStr::StringToInt(tokens.back());
193 tokens.pop_back();
194 }
195 else {
196 // trailing token is part of the data
197 }
198 seqData = NStr::Join(tokens, "");
199 }
200
201 // ----------------------------------------------------------------------------
202 void
ProcessDataLine(const string & dataLine,string & seqId,string & seqData)203 AlnUtil::ProcessDataLine(
204 const string& dataLine,
205 string& seqId,
206 string& seqData)
207 // ----------------------------------------------------------------------------
208 {
209 list<string> tokens;
210 NStr::Split(dataLine, " \t", tokens, NStr::fSplit_MergeDelimiters);
211 if (tokens.size() < 2) {
212 throw SShowStopper(
213 -1,
214 eAlnSubcode_IllegalDataLine,
215 "Bad data line: Expected \"<seqId> <data> <offset>\"");
216 }
217 seqId = tokens.front();
218 tokens.pop_front();
219 seqData = NStr::Join(tokens, "");
220 }
221
222 // ----------------------------------------------------------------------------
223 void
StripBlanks(const string & line,string & stripped)224 AlnUtil::StripBlanks(
225 const string& line,
226 string& stripped)
227 // ----------------------------------------------------------------------------
228 {
229 stripped = NStr::TruncateSpaces(line);
230 vector<string> splits;
231 NStr::Split(stripped, " \t", splits, NStr::fSplit_MergeDelimiters);
232 stripped = NStr::Join(splits, "");
233 }
234
235 END_SCOPE(objects)
236 END_NCBI_SCOPE
237