1 /*
2  * $Id: aln_util.cpp 632526 2021-06-02 17:25:01Z ivanov $
3  *
4  * ===========================================================================
5  *
6  *                            PUBLIC DOMAIN NOTICE
7  *               National Center for Biotechnology Information
8  *
9  *  This software/database is a "United States Government Work" under the
10  *  terms of the United States Copyright Act.  It was written as part of
11  *  the author's official duties as a United States Government employee and
12  *  thus cannot be copyrighted.  This software/database is freely available
13  *  to the public for use. The National Library of Medicine and the U.S.
14  *  Government have not placed any restriction on its use or reproduction.
15  *
16  *  Although all reasonable efforts have been taken to ensure the accuracy
17  *  and reliability of the software and data, the NLM and the U.S.
18  *  Government do not and cannot warrant the performance or results that
19  *  may be obtained by using this software or data. The NLM and the U.S.
20  *  Government disclaim all warranties, express or implied, including
21  *  warranties of performance, merchantability or fitness for any particular
22  *  purpose.
23  *
24  *  Please cite the author in any work or product based on this material.
25  *
26  * ===========================================================================
27  *
28  * Authors:  Frank Ludwig
29  *
30  */
31 
32 #include <ncbi_pch.hpp>
33 #include <corelib/ncbistr.hpp>
34 #include <objtools/readers/reader_error_codes.hpp>
35 #include <objtools/readers/message_listener.hpp>
36 #include <objtools/readers/alnread.hpp>
37 #include <objtools/readers/reader_error_codes.hpp>
38 #include "aln_errors.hpp"
39 #include "aln_util.hpp"
40 
41 BEGIN_NCBI_SCOPE
42 BEGIN_SCOPE(objects);
43 
44 
45 //  --------------------------------------------------------------------------
46 void
CheckId(const string & seqId,const vector<SLineInfo> & orderedIds,int idCount,int lineNum,bool firstBlock)47 AlnUtil::CheckId(const string& seqId,
48         const vector<SLineInfo>& orderedIds,
49         int idCount,
50         int lineNum,
51         bool firstBlock)
52 //  --------------------------------------------------------------------------
53 {
54     string description;
55 
56     if ((orderedIds.size() > idCount) &&
57         seqId == orderedIds[idCount].mData) {
58         return;
59     }
60 
61 
62     string seqIdLower(seqId);
63     NStr::ToLower(seqIdLower);
64 
65     auto it = orderedIds.begin();
66     bool exactCopy = false;
67     while (it != orderedIds.end()) {
68         if (it->mData == seqId) {
69             exactCopy = true;
70             break;
71         }
72         auto orderedIdLower(it->mData);
73         NStr::ToLower(orderedIdLower);
74         if (orderedIdLower == seqIdLower) {
75             break;
76         }
77         ++it;
78     }
79 
80 
81     if (firstBlock) {
82         if (it != orderedIds.end()) {
83             if (exactCopy) {
84                 description = ErrorPrintf(
85                     "Duplicate ID: \"%s\" has already appeared in this block, on line %d.", seqId.c_str(), it->mNumLine);
86             }
87             else {
88                 description = ErrorPrintf(
89                 "Conflicting IDs: \"%s\" differs only in case from \"%s\", which has already appeared in this block, on line %d.", seqId.c_str(), it->mData.c_str(), it->mNumLine);
90             }
91             throw SShowStopper(
92                 lineNum,
93                 eAlnSubcode_UnexpectedSeqId,
94                 description);
95 
96         }
97         return;
98     }
99 
100     if (it == orderedIds.end()) {
101         description = ErrorPrintf(
102             "Inconsistent sequence_IDs in the data blocks. Each data block must contain the same set of sequence_IDs.");
103         throw SShowStopper(
104             lineNum,
105             eAlnSubcode_BadSequenceCount,
106             description);
107     }
108 
109 
110     auto idPos = distance(orderedIds.begin(), it);
111     if (idPos < idCount) {
112         if (exactCopy) {
113             description = ErrorPrintf(
114                 "Duplicate ID: \"%s \" has already appeared in this block, on line %d.",
115                 seqId.c_str(), it->mNumLine);
116         }
117         else {
118             description = ErrorPrintf(
119             "Conflicting IDs: \"%s\" differs only in case from \"%s\", which has already appeared in this block, on line %d.", seqId.c_str(), it->mData.c_str(), it->mNumLine);
120         }
121     }
122     else
123     if (idPos == idCount) { //
124         description = ErrorPrintf(
125             "Inconsistent ID case: \"%s\" differs in case from \"%s\" used to identify this sequence in the first block.",
126             seqId.c_str(), it->mData.c_str());
127     }
128     else
129     {
130         description = "Sequence_IDs are in different orders in the data blocks in your file. The sequences and sequence_IDs are expected to be in the same order in each block.";
131     }
132     throw SShowStopper(
133         lineNum,
134         eAlnSubcode_UnexpectedSeqId,
135         description);
136 }
137 
138 
139 //  --------------------------------------------------------------------------
140 void
ProcessDefline(const string & line,string & seqId,string & defLineInfo)141 AlnUtil::ProcessDefline(
142     const string& line,
143     string& seqId,
144     string& defLineInfo)
145 //  --------------------------------------------------------------------------
146 {
147     if (!NStr::StartsWith(line, ">")) {
148         throw SShowStopper(
149             -1,
150             eAlnSubcode_IllegalDataLine,
151             "Deflines were detected in your file, however some lines are missing the \'>\' character at the beginning of the line. Each defline must begin with \'>\'.");
152     }
153     auto dataStart = line.find_first_not_of(" \t", 1);
154     if (dataStart == string::npos) {
155         throw SShowStopper(
156             -1,
157             eAlnSubcode_IllegalDataLine,
158             "Bad defline line: Should not be empty");
159     }
160     string defLine = line.substr(dataStart);
161     if (NStr::StartsWith(defLine, "[")) {
162         seqId.clear();
163         defLineInfo = defLine;
164     }
165     else {
166         NStr::SplitInTwo(defLine.substr(dataStart), " \t", seqId, defLineInfo,
167             NStr::fSplit_MergeDelimiters);
168     }
169 }
170 
171 //  ----------------------------------------------------------------------------
172 void
ProcessDataLine(const string & dataLine,string & seqId,string & seqData,int & offset)173 AlnUtil::ProcessDataLine(
174     const string& dataLine,
175     string& seqId,
176     string& seqData,
177     int& offset)
178 //  ----------------------------------------------------------------------------
179 {
180     list<string> tokens;
181     NStr::Split(dataLine, " \t", tokens, NStr::fSplit_MergeDelimiters);
182     if (tokens.size() < 2) {
183         throw SShowStopper(
184             -1,
185             eAlnSubcode_IllegalDataLine,
186             "Bad data line: Expected \"<seqId> <data> <offset>\"");
187     }
188     seqId = tokens.front();
189     tokens.pop_front();
190     if (tokens.back().find_first_not_of("0123456789") == string::npos) {
191         // trailing token is offset
192         offset = NStr::StringToInt(tokens.back());
193         tokens.pop_back();
194     }
195     else {
196         // trailing token is part of the data
197     }
198     seqData = NStr::Join(tokens, "");
199 }
200 
201 //  ----------------------------------------------------------------------------
202 void
ProcessDataLine(const string & dataLine,string & seqId,string & seqData)203 AlnUtil::ProcessDataLine(
204     const string& dataLine,
205     string& seqId,
206     string& seqData)
207 //  ----------------------------------------------------------------------------
208 {
209     list<string> tokens;
210     NStr::Split(dataLine, " \t", tokens, NStr::fSplit_MergeDelimiters);
211     if (tokens.size() < 2) {
212         throw SShowStopper(
213             -1,
214             eAlnSubcode_IllegalDataLine,
215             "Bad data line: Expected \"<seqId> <data> <offset>\"");
216     }
217     seqId = tokens.front();
218     tokens.pop_front();
219     seqData = NStr::Join(tokens, "");
220 }
221 
222 //  ----------------------------------------------------------------------------
223 void
StripBlanks(const string & line,string & stripped)224 AlnUtil::StripBlanks(
225     const string& line,
226     string& stripped)
227 //  ----------------------------------------------------------------------------
228 {
229     stripped = NStr::TruncateSpaces(line);
230     vector<string> splits;
231     NStr::Split(stripped, " \t", splits, NStr::fSplit_MergeDelimiters);
232     stripped = NStr::Join(splits, "");
233 }
234 
235 END_SCOPE(objects)
236 END_NCBI_SCOPE
237