1 /*
2  * $Id: seqid_validate.cpp 632526 2021-06-02 17:25:01Z ivanov $
3  *
4  * ===========================================================================
5  *
6  *                            PUBLIC DOMAIN NOTICE
7  *               National Center for Biotechnology Information
8  *
9  *  This software/database is a "United States Government Work" under the
10  *  terms of the United States Copyright Act.  It was written as part of
11  *  the author's official duties as a United States Government employee and
12  *  thus cannot be copyrighted.  This software/database is freely available
13  *  to the public for use. The National Library of Medicine and the U.S.
14  *  Government have not placed any restriction on its use or reproduction.
15  *
16  *  Although all reasonable efforts have been taken to ensure the accuracy
17  *  and reliability of the software and data, the NLM and the U.S.
18  *  Government do not and cannot warrant the performance or results that
19  *  may be obtained by using this software or data. The NLM and the U.S.
20  *  Government disclaim all warranties, express or implied, including
21  *  warranties of performance, merchantability or fitness for any particular
22  *  purpose.
23  *
24  *  Please cite the author in any work or product based on this material.
25  *
26  * ===========================================================================
27  *
28  * Authors:  Frank Ludwig, Justin Foley
29  *
30  */
31 
32 #include <ncbi_pch.hpp>
33 #include <corelib/ncbistr.hpp>
34 #include <objtools/readers/reader_error_codes.hpp>
35 #include <objtools/readers/message_listener.hpp>
36 #include <objtools/readers/alnread.hpp>
37 #include <objtools/readers/reader_error_codes.hpp>
38 #include <objtools/readers/fasta.hpp>
39 #include <objects/general/Object_id.hpp>
40 #include <objects/general/Dbtag.hpp>
41 #include <objects/seqloc/Seq_id.hpp>
42 #include <objtools/readers/aln_error_reporter.hpp>
43 #include <objtools/readers/seqid_validate.hpp>
44 
45 BEGIN_NCBI_SCOPE
46 BEGIN_SCOPE(objects);
47 
operator ()(const CSeq_id & seqId,int lineNum,CAlnErrorReporter * pErrorReporter)48 void CSeqIdValidate::operator()(const CSeq_id& seqId,
49         int lineNum,
50         CAlnErrorReporter* pErrorReporter)
51 {
52 
53     if (!pErrorReporter) {
54         return;
55     }
56 
57     if (seqId.IsLocal() &&
58         seqId.GetLocal().IsStr()) {
59         const auto idString = seqId.GetLocal().GetStr();
60 
61         bool foundError = false;
62         string description;
63         if (idString.empty()) {
64             description = "Empty local ID.";
65             foundError = true;
66         }
67         else
68         if (idString.size() > 50) {
69             description = "Local ID \"" +
70                           idString +
71                           " \" exceeds 50 character limit.";
72             foundError = true;
73         }
74         else
75         if (CSeq_id::CheckLocalID(idString) & CSeq_id::fInvalidChar) {
76             description = "Local ID \"" +
77                           idString +
78                           "\" contains invalid characters.";
79             foundError = true;
80         }
81 
82         if (foundError) {
83             pErrorReporter->Error(
84                     lineNum,
85                     EAlnSubcode::eAlnSubcode_IllegalSequenceId,
86                     description);
87         }
88     }
89     // default implementation only checks local IDs
90 }
91 
92 
93 
operator ()(const list<CRef<CSeq_id>> & ids,int lineNum,CAlnErrorReporter * pErrorReporter)94 void CSeqIdValidate::operator()(const list<CRef<CSeq_id>>& ids,
95         int lineNum,
96         CAlnErrorReporter* pErrorReporter) {
97 
98     for (auto pSeqId : ids) {
99         operator()(*pSeqId, lineNum, pErrorReporter);
100     }
101 }
102 
103 
CFastaIdValidate(TFastaFlags flags)104 CFastaIdValidate::CFastaIdValidate(TFastaFlags flags) :
105     m_Flags(flags) {}
106 
107 
operator ()(const TIds & ids,int lineNum,FReportError fReportError)108 void CFastaIdValidate::operator()(const TIds& ids,
109         int lineNum,
110         FReportError fReportError)
111 {
112     if (ids.empty()) {
113         return;
114     }
115 
116     if (!(m_Flags&CFastaReader::fAssumeProt)) {
117         CheckForExcessiveNucData(*ids.back(), lineNum, fReportError);
118     }
119 
120     if (!(m_Flags&CFastaReader::fAssumeNuc)) {
121         CheckForExcessiveProtData(*ids.back(), lineNum, fReportError);
122     }
123 
124 
125     for (const auto& pId : ids) {
126         if (pId->IsLocal() &&
127             !IsValidLocalID(*pId)) {
128             const auto& idString = pId->GetSeqIdString();
129             string msg = "'" + idString + "' is not a valid local ID";
130             fReportError(eDiag_Error, lineNum, idString, CFastaIdValidate::eBadLocalID, msg);
131         }
132         CheckIDLength(*pId, lineNum, fReportError);
133     }
134 }
135 
136 
SetMaxLocalIDLength(size_t length)137 void CFastaIdValidate::SetMaxLocalIDLength(size_t length)
138 {
139     kMaxLocalIDLength = length;
140 }
141 
142 
SetMaxGeneralTagLength(size_t length)143 void CFastaIdValidate::SetMaxGeneralTagLength(size_t length)
144 {
145     kMaxGeneralTagLength = length;
146 }
147 
148 
SetMaxAccessionLength(size_t length)149 void CFastaIdValidate::SetMaxAccessionLength(size_t length)
150 {
151     kMaxAccessionLength = length;
152 }
153 
154 
// Build the message used when an ID exceeds its maximum permitted length.
//
// length           - actual length of the offending ID text.
// idType           - human-readable kind of ID ("local id", "accession", ...);
//                    it is inserted verbatim, and pluralised with a trailing
//                    "s" in the final sentence.
// maxAllowedLength - limit that was exceeded.
// lineNum          - line number quoted in the message.
//
// The lengths are taken as size_t because that is what the callers in this
// file pass (string lengths and size limits); taking them as int would
// silently narrow them.
static string s_GetIDLengthErrorString(size_t length,
        const string& idType,
        size_t maxAllowedLength,
        int lineNum)
{
    string err_message =
        "Near line " + NStr::NumericToString(lineNum)
        + ", the " + idType + " is too long.  Its length is " + NStr::NumericToString(length)
        + " but the maximum allowed " + idType + " length is " + NStr::NumericToString(maxAllowedLength)
        + ".  Please find and correct all " + idType + "s that are too long.";

    return err_message;
}
168 
169 
CheckIDLength(const CSeq_id & id,int lineNum,FReportError fReportError) const170 void CFastaIdValidate::CheckIDLength(const CSeq_id& id, int lineNum, FReportError fReportError) const
171 {
172     if (id.IsLocal()) {
173         if (id.GetLocal().IsStr() &&
174             id.GetLocal().GetStr().length() > kMaxLocalIDLength) {
175             const auto& msg =
176                 s_GetIDLengthErrorString(id.GetLocal().GetStr().length(),
177                         "local id",
178                         kMaxLocalIDLength,
179                         lineNum);
180             fReportError(eDiag_Error, lineNum, id.GetSeqIdString(), CFastaIdValidate::eIDTooLong, msg);
181         }
182         return;
183     }
184 
185     if (id.IsGeneral()) {
186         if (id.GetGeneral().IsSetTag() &&
187             id.GetGeneral().GetTag().IsStr()) {
188             const auto length = id.GetGeneral().GetTag().GetStr().length();
189             if (length > kMaxGeneralTagLength) {
190                 const auto& msg =
191                     s_GetIDLengthErrorString(id.GetGeneral().GetTag().GetStr().length(),
192                             "general id string",
193                             kMaxGeneralTagLength,
194                             lineNum);
195                 fReportError(eDiag_Error, lineNum, id.GetSeqIdString(), CFastaIdValidate::eIDTooLong, msg);
196             }
197         }
198         return;
199     }
200 
201     auto pTextId = id.GetTextseq_Id();
202     if (pTextId && pTextId->IsSetAccession()) {
203         const auto length = pTextId->GetAccession().length();
204         if (length > kMaxAccessionLength) {
205             const auto& msg =
206                 s_GetIDLengthErrorString(length,
207                                 "accession",
208                                 kMaxAccessionLength,
209                                 lineNum);
210             fReportError(eDiag_Error, lineNum, id.GetSeqIdString(), CFastaIdValidate::eIDTooLong, msg);
211         }
212     }
213 }
214 
215 
IsValidLocalID(const CSeq_id & id) const216 bool CFastaIdValidate::IsValidLocalID(const CSeq_id& id) const
217 {
218     if (id.IsLocal()) {
219         if (id.GetLocal().IsId()) {
220             return true;
221         }
222         if (id.GetLocal().IsStr()) {
223             return IsValidLocalString(id.GetLocal().GetStr());
224         }
225     }
226     return false;
227 }
228 
229 
IsValidLocalString(const CTempString & idString) const230 bool CFastaIdValidate::IsValidLocalString(const CTempString& idString) const
231 {
232     const CTempString& checkString =
233         (m_Flags & CFastaReader::fQuickIDCheck) ?
234         idString.substr(0,1) :
235         idString;
236 
237     return !(CSeq_id::CheckLocalID(checkString)&CSeq_id::fInvalidChar);
238 }
239 
240 
CheckForExcessiveNucData(const CSeq_id & id,int lineNum,FReportError fReportError) const241 void CFastaIdValidate::CheckForExcessiveNucData(
242     const CSeq_id& id,
243     int lineNum,
244     FReportError fReportError
245 ) const
246 {
247     const auto& idString = id.GetSeqIdString();
248     if (idString.length() > kWarnNumNucCharsAtEnd) {
249         TSeqPos numNucChars = CountPossibleNucResidues(idString);
250         if (numNucChars > kWarnNumNucCharsAtEnd) {
251             const string err_message =
252             "Fasta Reader: sequence id ends with " +
253             NStr::NumericToString(numNucChars) +
254             " valid nucleotide characters. " +
255             " Was the sequence accidentally placed in the definition line?";
256 
257             auto severity = (numNucChars > kErrNumNucCharsAtEnd) ?
258                             eDiag_Error :
259                             eDiag_Warning;
260 
261             fReportError(severity, lineNum, idString, eUnexpectedNucResidues, err_message);
262             return;
263         }
264     }
265 }
266 
267 
// Tell whether a character could be a nucleotide residue: A, C, G, T or the
// "any base" code N, in either case.
//
// Lowercase 'n' is accepted like the other lowercase codes, so that a
// trailing run of lowercase residues is not cut short at the first 'n'.
static bool s_IsPossibleNuc(unsigned char c)
{
    switch( c ) {
    case 'N':
    case 'A':
    case 'C':
    case 'G':
    case 'T':
    case 'n':
    case 'a':
    case 'c':
    case 'g':
    case 't':
        return true;
    default:
        return false;
    }
}
285 
286 
CountPossibleNucResidues(const string & idString)287 size_t CFastaIdValidate::CountPossibleNucResidues(
288         const string& idString)
289 {
290     const auto first_it = rbegin(idString);
291     const auto it = find_if_not(first_it, rend(idString), s_IsPossibleNuc);
292 
293     return static_cast<size_t>(distance(first_it, it));
294 }
295 
296 
CheckForExcessiveProtData(const CSeq_id & id,int lineNum,FReportError fReportError) const297 void CFastaIdValidate::CheckForExcessiveProtData(
298         const CSeq_id& id,
299         int lineNum,
300         FReportError fReportError) const
301 {
302     const auto& idString = id.GetSeqIdString();
303 
304     // Check for Aa sequence
305     if (idString.length() > kWarnNumAminoAcidCharsAtEnd) {
306         const auto numAaChars = CountPossibleAminoAcids(idString);
307         if (numAaChars > kWarnNumAminoAcidCharsAtEnd) {
308             const string err_message =
309             "Fasta Reader: sequence id ends with " +
310             NStr::NumericToString(numAaChars) +
311             " valid amino-acid characters. " +
312             " Was the sequence accidentally placed in the definition line?";
313             fReportError(eDiag_Warning, lineNum, idString, eUnexpectedAminoAcids, err_message);
314         }
315     }
316 }
317 
318 
CountPossibleAminoAcids(const string & idString)319 size_t CFastaIdValidate::CountPossibleAminoAcids(
320         const string& idString)
321 {
322     const auto first_it = rbegin(idString);
323     const auto it = find_if_not(first_it, rend(idString),
324             [](char c) { return (c >= 'A' && c <= 'Z') ||
325                                 (c >= 'a' && c <= 'z'); });
326 
327     return static_cast<size_t>(distance(first_it, it));
328 }
329 
330 END_SCOPE(objects)
331 END_NCBI_SCOPE
332