1 /*
2 * $Id: seqid_validate.cpp 632526 2021-06-02 17:25:01Z ivanov $
3 *
4 * ===========================================================================
5 *
6 * PUBLIC DOMAIN NOTICE
7 * National Center for Biotechnology Information
8 *
9 * This software/database is a "United States Government Work" under the
10 * terms of the United States Copyright Act. It was written as part of
11 * the author's official duties as a United States Government employee and
12 * thus cannot be copyrighted. This software/database is freely available
13 * to the public for use. The National Library of Medicine and the U.S.
14 * Government have not placed any restriction on its use or reproduction.
15 *
16 * Although all reasonable efforts have been taken to ensure the accuracy
17 * and reliability of the software and data, the NLM and the U.S.
18 * Government do not and cannot warrant the performance or results that
19 * may be obtained by using this software or data. The NLM and the U.S.
20 * Government disclaim all warranties, express or implied, including
21 * warranties of performance, merchantability or fitness for any particular
22 * purpose.
23 *
24 * Please cite the author in any work or product based on this material.
25 *
26 * ===========================================================================
27 *
28 * Authors: Frank Ludwig Justin Foley
29 *
30 */
31
32 #include <ncbi_pch.hpp>
33 #include <corelib/ncbistr.hpp>
34 #include <objtools/readers/reader_error_codes.hpp>
35 #include <objtools/readers/message_listener.hpp>
36 #include <objtools/readers/alnread.hpp>
37 #include <objtools/readers/reader_error_codes.hpp>
38 #include <objtools/readers/fasta.hpp>
39 #include <objects/general/Object_id.hpp>
40 #include <objects/general/Dbtag.hpp>
41 #include <objects/seqloc/Seq_id.hpp>
42 #include <objtools/readers/aln_error_reporter.hpp>
43 #include <objtools/readers/seqid_validate.hpp>
44
45 BEGIN_NCBI_SCOPE
46 BEGIN_SCOPE(objects);
47
operator ()(const CSeq_id & seqId,int lineNum,CAlnErrorReporter * pErrorReporter)48 void CSeqIdValidate::operator()(const CSeq_id& seqId,
49 int lineNum,
50 CAlnErrorReporter* pErrorReporter)
51 {
52
53 if (!pErrorReporter) {
54 return;
55 }
56
57 if (seqId.IsLocal() &&
58 seqId.GetLocal().IsStr()) {
59 const auto idString = seqId.GetLocal().GetStr();
60
61 bool foundError = false;
62 string description;
63 if (idString.empty()) {
64 description = "Empty local ID.";
65 foundError = true;
66 }
67 else
68 if (idString.size() > 50) {
69 description = "Local ID \"" +
70 idString +
71 " \" exceeds 50 character limit.";
72 foundError = true;
73 }
74 else
75 if (CSeq_id::CheckLocalID(idString) & CSeq_id::fInvalidChar) {
76 description = "Local ID \"" +
77 idString +
78 "\" contains invalid characters.";
79 foundError = true;
80 }
81
82 if (foundError) {
83 pErrorReporter->Error(
84 lineNum,
85 EAlnSubcode::eAlnSubcode_IllegalSequenceId,
86 description);
87 }
88 }
89 // default implementation only checks local IDs
90 }
91
92
93
operator ()(const list<CRef<CSeq_id>> & ids,int lineNum,CAlnErrorReporter * pErrorReporter)94 void CSeqIdValidate::operator()(const list<CRef<CSeq_id>>& ids,
95 int lineNum,
96 CAlnErrorReporter* pErrorReporter) {
97
98 for (auto pSeqId : ids) {
99 operator()(*pSeqId, lineNum, pErrorReporter);
100 }
101 }
102
103
CFastaIdValidate(TFastaFlags flags)104 CFastaIdValidate::CFastaIdValidate(TFastaFlags flags) :
105 m_Flags(flags) {}
106
107
operator ()(const TIds & ids,int lineNum,FReportError fReportError)108 void CFastaIdValidate::operator()(const TIds& ids,
109 int lineNum,
110 FReportError fReportError)
111 {
112 if (ids.empty()) {
113 return;
114 }
115
116 if (!(m_Flags&CFastaReader::fAssumeProt)) {
117 CheckForExcessiveNucData(*ids.back(), lineNum, fReportError);
118 }
119
120 if (!(m_Flags&CFastaReader::fAssumeNuc)) {
121 CheckForExcessiveProtData(*ids.back(), lineNum, fReportError);
122 }
123
124
125 for (const auto& pId : ids) {
126 if (pId->IsLocal() &&
127 !IsValidLocalID(*pId)) {
128 const auto& idString = pId->GetSeqIdString();
129 string msg = "'" + idString + "' is not a valid local ID";
130 fReportError(eDiag_Error, lineNum, idString, CFastaIdValidate::eBadLocalID, msg);
131 }
132 CheckIDLength(*pId, lineNum, fReportError);
133 }
134 }
135
136
SetMaxLocalIDLength(size_t length)137 void CFastaIdValidate::SetMaxLocalIDLength(size_t length)
138 {
139 kMaxLocalIDLength = length;
140 }
141
142
SetMaxGeneralTagLength(size_t length)143 void CFastaIdValidate::SetMaxGeneralTagLength(size_t length)
144 {
145 kMaxGeneralTagLength = length;
146 }
147
148
SetMaxAccessionLength(size_t length)149 void CFastaIdValidate::SetMaxAccessionLength(size_t length)
150 {
151 kMaxAccessionLength = length;
152 }
153
154
// Build the message used when an ID component exceeds its length limit.
//
// length            Actual length of the offending component.
// idType            Human-readable name of the component (e.g. "accession");
//                   it appears three times in the message, the last time with
//                   an "s" appended.
// maxAllowedLength  The limit that was exceeded.
// lineNum           Line number quoted at the start of the message.
//
// Returns the formatted message.
//
// The two lengths are size_t because that is what every caller in this file
// passes (std::string::length() and the configured limits); taking them as
// int would narrow the values before they are printed.
static string s_GetIDLengthErrorString(size_t length,
    const string& idType,
    size_t maxAllowedLength,
    int lineNum)
{
    string err_message =
        "Near line " + NStr::NumericToString(lineNum)
        + ", the " + idType + " is too long. Its length is " + NStr::NumericToString(length)
        + " but the maximum allowed " + idType + " length is "+ NStr::NumericToString(maxAllowedLength)
        + ". Please find and correct all " + idType + "s that are too long.";

    return err_message;
}
168
169
CheckIDLength(const CSeq_id & id,int lineNum,FReportError fReportError) const170 void CFastaIdValidate::CheckIDLength(const CSeq_id& id, int lineNum, FReportError fReportError) const
171 {
172 if (id.IsLocal()) {
173 if (id.GetLocal().IsStr() &&
174 id.GetLocal().GetStr().length() > kMaxLocalIDLength) {
175 const auto& msg =
176 s_GetIDLengthErrorString(id.GetLocal().GetStr().length(),
177 "local id",
178 kMaxLocalIDLength,
179 lineNum);
180 fReportError(eDiag_Error, lineNum, id.GetSeqIdString(), CFastaIdValidate::eIDTooLong, msg);
181 }
182 return;
183 }
184
185 if (id.IsGeneral()) {
186 if (id.GetGeneral().IsSetTag() &&
187 id.GetGeneral().GetTag().IsStr()) {
188 const auto length = id.GetGeneral().GetTag().GetStr().length();
189 if (length > kMaxGeneralTagLength) {
190 const auto& msg =
191 s_GetIDLengthErrorString(id.GetGeneral().GetTag().GetStr().length(),
192 "general id string",
193 kMaxGeneralTagLength,
194 lineNum);
195 fReportError(eDiag_Error, lineNum, id.GetSeqIdString(), CFastaIdValidate::eIDTooLong, msg);
196 }
197 }
198 return;
199 }
200
201 auto pTextId = id.GetTextseq_Id();
202 if (pTextId && pTextId->IsSetAccession()) {
203 const auto length = pTextId->GetAccession().length();
204 if (length > kMaxAccessionLength) {
205 const auto& msg =
206 s_GetIDLengthErrorString(length,
207 "accession",
208 kMaxAccessionLength,
209 lineNum);
210 fReportError(eDiag_Error, lineNum, id.GetSeqIdString(), CFastaIdValidate::eIDTooLong, msg);
211 }
212 }
213 }
214
215
IsValidLocalID(const CSeq_id & id) const216 bool CFastaIdValidate::IsValidLocalID(const CSeq_id& id) const
217 {
218 if (id.IsLocal()) {
219 if (id.GetLocal().IsId()) {
220 return true;
221 }
222 if (id.GetLocal().IsStr()) {
223 return IsValidLocalString(id.GetLocal().GetStr());
224 }
225 }
226 return false;
227 }
228
229
IsValidLocalString(const CTempString & idString) const230 bool CFastaIdValidate::IsValidLocalString(const CTempString& idString) const
231 {
232 const CTempString& checkString =
233 (m_Flags & CFastaReader::fQuickIDCheck) ?
234 idString.substr(0,1) :
235 idString;
236
237 return !(CSeq_id::CheckLocalID(checkString)&CSeq_id::fInvalidChar);
238 }
239
240
CheckForExcessiveNucData(const CSeq_id & id,int lineNum,FReportError fReportError) const241 void CFastaIdValidate::CheckForExcessiveNucData(
242 const CSeq_id& id,
243 int lineNum,
244 FReportError fReportError
245 ) const
246 {
247 const auto& idString = id.GetSeqIdString();
248 if (idString.length() > kWarnNumNucCharsAtEnd) {
249 TSeqPos numNucChars = CountPossibleNucResidues(idString);
250 if (numNucChars > kWarnNumNucCharsAtEnd) {
251 const string err_message =
252 "Fasta Reader: sequence id ends with " +
253 NStr::NumericToString(numNucChars) +
254 " valid nucleotide characters. " +
255 " Was the sequence accidentally placed in the definition line?";
256
257 auto severity = (numNucChars > kErrNumNucCharsAtEnd) ?
258 eDiag_Error :
259 eDiag_Warning;
260
261 fReportError(severity, lineNum, idString, eUnexpectedNucResidues, err_message);
262 return;
263 }
264 }
265 }
266
267
// Tell whether a character is one of the nucleotide letters this file looks
// for at the end of an ID: N, A, C, G, T in upper case and a, c, g, t in
// lower case. Note that lower-case 'n' is not in the accepted set.
static bool s_IsPossibleNuc(unsigned char c)
{
    static const unsigned char kAccepted[] = {
        'N', 'A', 'C', 'G', 'T',
        'a', 'c', 'g', 't'
    };

    for (const unsigned char candidate : kAccepted) {
        if (c == candidate) {
            return true;
        }
    }
    return false;
}
285
286
CountPossibleNucResidues(const string & idString)287 size_t CFastaIdValidate::CountPossibleNucResidues(
288 const string& idString)
289 {
290 const auto first_it = rbegin(idString);
291 const auto it = find_if_not(first_it, rend(idString), s_IsPossibleNuc);
292
293 return static_cast<size_t>(distance(first_it, it));
294 }
295
296
CheckForExcessiveProtData(const CSeq_id & id,int lineNum,FReportError fReportError) const297 void CFastaIdValidate::CheckForExcessiveProtData(
298 const CSeq_id& id,
299 int lineNum,
300 FReportError fReportError) const
301 {
302 const auto& idString = id.GetSeqIdString();
303
304 // Check for Aa sequence
305 if (idString.length() > kWarnNumAminoAcidCharsAtEnd) {
306 const auto numAaChars = CountPossibleAminoAcids(idString);
307 if (numAaChars > kWarnNumAminoAcidCharsAtEnd) {
308 const string err_message =
309 "Fasta Reader: sequence id ends with " +
310 NStr::NumericToString(numAaChars) +
311 " valid amino-acid characters. " +
312 " Was the sequence accidentally placed in the definition line?";
313 fReportError(eDiag_Warning, lineNum, idString, eUnexpectedAminoAcids, err_message);
314 }
315 }
316 }
317
318
CountPossibleAminoAcids(const string & idString)319 size_t CFastaIdValidate::CountPossibleAminoAcids(
320 const string& idString)
321 {
322 const auto first_it = rbegin(idString);
323 const auto it = find_if_not(first_it, rend(idString),
324 [](char c) { return (c >= 'A' && c <= 'Z') ||
325 (c >= 'a' && c <= 'z'); });
326
327 return static_cast<size_t>(distance(first_it, it));
328 }
329
330 END_SCOPE(objects)
331 END_NCBI_SCOPE
332