1 /* $Id: stream_source.cpp 542800 2017-08-02 18:30:15Z lavr $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Mike DiCuccio
27 *
28 * File Description:
29 *
30 */
31
32 #include <ncbi_pch.hpp>
33 #include <corelib/ncbifile.hpp>
34 #include <util/file_manifest.hpp>
35 #include <util/stream_source.hpp>
36
37 BEGIN_NCBI_SCOPE
38
/// Prefixes that have been passed to
/// CInputStreamSource::SetStandardInputArgs(); queried by
/// CInputStreamSource::HaveArgsForPrefix().
static std::set<std::string> s_InputStreamSourcePrefixes;
40
SetStandardInputArgs(CArgDescriptions & arg_desc,const string & prefix,const string & description,bool is_mandatory)41 void CInputStreamSource::SetStandardInputArgs(CArgDescriptions& arg_desc,
42 const string &prefix,
43 const string &description,
44 bool is_mandatory)
45 {
46 arg_desc.SetCurrentGroup("Input Options for " + prefix);
47 if (prefix == "input") {
48 arg_desc.AddDefaultKey("input", "InputFile",
49 "Stream of " + description,
50 CArgDescriptions::eInputFile,
51 "-");
52 arg_desc.AddAlias("i", "input");
53 } else {
54 if (is_mandatory) {
55 arg_desc.AddKey(prefix, "InputFile",
56 "Stream of " + description,
57 CArgDescriptions::eInputFile);
58 }
59 else {
60 arg_desc.AddOptionalKey(prefix, "InputFile",
61 "Stream of " + description,
62 CArgDescriptions::eInputFile);
63 }
64 }
65
66 arg_desc.AddOptionalKey(prefix + "-path", "InputPath",
67 "Path to " + description,
68 CArgDescriptions::eString);
69 arg_desc.AddOptionalKey(prefix + "-mask", "FileMask",
70 "File pattern to search for " + description,
71 CArgDescriptions::eString);
72 arg_desc.SetDependency(prefix + "-mask",
73 CArgDescriptions::eRequires,
74 prefix + "-path");
75
76 arg_desc.AddOptionalKey(prefix + "-manifest", "InputFile",
77 "File containing a list of files containing " + description,
78 CArgDescriptions::eInputFile);
79
80 arg_desc.SetDependency(prefix,
81 CArgDescriptions::eExcludes,
82 prefix + "-manifest");
83
84 arg_desc.SetDependency(prefix,
85 CArgDescriptions::eExcludes,
86 prefix + "-path");
87
88 arg_desc.SetDependency(prefix + "-manifest",
89 CArgDescriptions::eExcludes,
90 prefix + "-path");
91
92 if (prefix == "input") {
93 arg_desc.AddAlias("I", "input-manifest");
94 }
95
96 s_InputStreamSourcePrefixes.insert(prefix);
97 }
98
RecreateInputArgs(const CArgs & args,const string & prefix)99 vector<string> CInputStreamSource::RecreateInputArgs(const CArgs& args, const string &prefix)
100 {
101 vector<string> result;
102 if (args[prefix + "-path"].HasValue()) {
103 result.push_back("-" + prefix + "-path");
104 result.push_back(args[prefix + "-path"].AsString());
105 if (args[prefix + "-mask"]) {
106 result.push_back("-" + prefix + "-mask");
107 result.push_back(args[prefix + "-mask"].AsString());
108 }
109 }
110 else if (args[prefix + "-manifest"].HasValue()) {
111 result.push_back("-" + prefix + "-manifest");
112 result.push_back(args[prefix + "-manifest"].AsString());
113 }
114 else {
115 result.push_back("-" + prefix);
116 result.push_back(args[prefix].AsString());
117 }
118 return result;
119 }
120
HaveArgsForPrefix(const string & prefix)121 bool CInputStreamSource::HaveArgsForPrefix(const string &prefix)
122 {
123 return s_InputStreamSourcePrefixes.count(prefix) ? true : false;
124 }
125
CInputStreamSource()126 CInputStreamSource::CInputStreamSource()
127 : m_Istr(NULL), m_CurrIndex(0)
128 {
129 }
130
131
CInputStreamSource(const CArgs & args,const string & prefix)132 CInputStreamSource::CInputStreamSource(const CArgs& args, const string& prefix)
133 : m_Istr(NULL), m_CurrIndex(0)
134 {
135 InitArgs(args, prefix);
136 }
137
138
~CInputStreamSource()139 CInputStreamSource::~CInputStreamSource()
140 {
141 }
142
InitArgs(const CArgs & args,const string & prefix)143 void CInputStreamSource::InitArgs(const CArgs& args, const string &prefix)
144 {
145 m_Args.Assign(args);
146 m_Prefix = prefix;
147
148 if (m_Args[prefix + "-path"].HasValue()) {
149 string path = m_Args[prefix + "-path"].AsString();
150 string mask;
151 if (m_Args[prefix + "-mask"]) {
152 mask = m_Args[prefix + "-mask"].AsString();
153 }
154 InitFilesInDirSubtree(path, mask);
155 }
156 else if (m_Args[prefix + "-manifest"].HasValue()) {
157 InitManifest(m_Args[prefix + "-manifest"].AsString());
158 }
159 else if (m_Args[prefix].HasValue() && m_Args[prefix].AsString() == "-") {
160 /// NOTE: this is ignored if either -input-path or -input-mask is
161 /// provided
162 InitStream(m_Args[prefix].AsInputFile(), m_Args[prefix].AsString());
163 }
164 else if (m_Args[prefix].HasValue()) {
165 /// Input file; init as input file, so it can be opened multiple times
166 InitFile(m_Args[prefix].AsString());
167 }
168 }
169
170
171 /// Initialize from a given stream which is the sole content.
172 /// As precondition, expect that the stream is in a good condition
173 /// prior to being handed off to consumers.
174 ///
InitStream(CNcbiIstream & istr,const string & fname)175 void CInputStreamSource::InitStream(CNcbiIstream& istr, const string& fname)
176 {
177 if (m_Istr || m_CurrIndex < m_Files.size()) {
178 NCBI_THROW(CException, eUnknown,
179 "CInputStreamSource::InitManifest(): "
180 "attempt to init already initted class");
181 }
182 if (! istr) {
183 NCBI_THROW(CException, eUnknown,
184 "CInputStreamSource::InitStream(): "
185 "stream is bad");
186 }
187 m_Files.clear();
188 m_Istr = &istr;
189 m_CurrFile = fname;
190 m_CurrIndex = 0;
191 }
192
193
194 /// Initialize from a single file path.
195 ///
InitFile(const string & file_path)196 void CInputStreamSource::InitFile(const string& file_path)
197 {
198 if (m_Istr || m_CurrIndex < m_Files.size()) {
199 NCBI_THROW(CException, eUnknown,
200 "CInputStreamSource::InitFile(): "
201 "attempt to init already initted class");
202 }
203
204 /**
205 * commented out: this breaks stream processing
206 if ( !CFile(file_path).Exists() ) {
207 NCBI_THROW(CException, eUnknown,
208 "input file " + file_path + " does not exist");
209 }
210 **/
211
212 m_Files.clear();
213 m_Files.push_back(file_path);
214 Rewind();
215 }
216
217
218 /// Initialize from a manifest file.
219 ///
220 /// @see CFileManifest
InitManifest(const string & manifest)221 void CInputStreamSource::InitManifest(const string& manifest)
222 {
223 if (m_Istr || m_CurrIndex < m_Files.size()) {
224 NCBI_THROW(CException, eUnknown,
225 "CInputStreamSource::InitManifest(): "
226 "attempt to init already initted class");
227 }
228
229 m_Files.clear();
230 CFileManifest src(manifest);
231 vector<string> all(src.GetAllFilePaths());
232 std::copy( all.begin(), all.end(), std::back_inserter(m_Files));
233
234 _TRACE("Added " << m_Files.size() << " files from input manifest");
235
236 Rewind();
237 }
238
239
240 /// Initialize from a file search path
241 ///
InitFilesInDirSubtree(const string & file_path,const string & file_mask)242 void CInputStreamSource::InitFilesInDirSubtree(const string& file_path,
243 const string& file_mask)
244 {
245 if (m_Istr || m_CurrIndex < m_Files.size()) {
246 NCBI_THROW(CException, eUnknown,
247 "CInputStreamSource::InitFilesInDirSubtree(): "
248 "atemmpt to init already initted class");
249 }
250
251 CDir d(file_path);
252 if ( !d.Exists() ) {
253 NCBI_THROW(CException, eUnknown,
254 "input directory " + file_path + " does not exist");
255 }
256
257 vector<string> paths;
258 paths.push_back(file_path);
259
260 vector<string> masks;
261 if ( !file_mask.empty() ) {
262 masks.push_back(file_mask);
263 } else {
264 masks.push_back("*");
265 }
266
267 m_Files.clear();
268 FindFiles(m_Files,
269 paths.begin(), paths.end(),
270 masks.begin(), masks.end(),
271 fFF_File | fFF_Recursive);
272 _TRACE("Added " << m_Files.size() << " files from input path");
273
274 Rewind();
275 }
276
277
GetStream(string * fname)278 CNcbiIstream& CInputStreamSource::GetStream(string* fname)
279 {
280 if (m_Istr) {
281 if (fname) {
282 *fname = m_CurrFile;
283 }
284 return *m_Istr;
285 }
286
287 if (m_IstrOwned.get()) {
288 if (fname) {
289 *fname = m_CurrFile;
290 }
291 return *m_IstrOwned;
292 }
293
294 NCBI_THROW(CException, eUnknown, "All input streams consumed");
295 }
296
GetStream(void)297 CNcbiIstream& CInputStreamSource::GetStream(void)
298 {
299 if (m_Istr) {
300 return *m_Istr;
301 }
302
303 if (m_IstrOwned.get()) {
304 return *m_IstrOwned;
305 }
306
307 NCBI_THROW(CException, eUnknown, "All input streams consumed");
308 }
309
310
operator *()311 CNcbiIstream& CInputStreamSource::operator*()
312 {
313 return GetStream();
314 }
315
316
operator ++()317 CInputStreamSource& CInputStreamSource::operator++()
318 {
319 // The next stream can be held in either of two places. Clear both.
320
321 // Clear first place.
322 if (m_Istr) {
323 if (m_Istr->bad()) {
324 // Check that the stream, at the end, didn't go bad as might
325 // happen if there was a disk read error. On the other hand,
326 // ok if it has failbit set so ignore that, e.g. getline sets
327 // failbit at the last line, if it has a teminator.
328 NCBI_THROW(CException, eUnknown,
329 "CInputStreamSource::operator++(): "
330 "Unknown error in input stream, "
331 "which is in a bad state after use");
332 }
333 m_Istr = NULL;
334 }
335
336 // Clear second place.
337 if (m_IstrOwned.get()) {
338 if (m_IstrOwned->bad()) {
339 // Samecheck as for m_Istr.
340 string msg("CInputStreamSource::operator++(): "
341 "Unknown error reading file, "
342 "which is in a bad state after use: ");
343 NCBI_THROW(CException, eUnknown, msg + m_CurrFile);
344 }
345 m_IstrOwned.reset();
346 }
347
348 // The current filename currently applies to only the first source,
349 // but someday might apply to others, so clear it here rather than
350 // inside the above conditionals.
351 m_CurrFile.erase();
352
353 // Advance to the next stream, if there is any.
354 if (m_CurrIndex < m_Files.size()) {
355 m_CurrFile = m_Files[m_CurrIndex++];
356 m_IstrOwned.reset(new CNcbiIfstream(m_CurrFile.c_str()));
357 if (m_IstrOwned->fail()) {
358 // Do not provide to clients with streams that are already
359 // known not to be good (fail, meaning badbit or failbit).
360 string msg("CInputStreamSource::operator++(): "
361 "File is not accessible: ");
362 NCBI_THROW(CException, eUnknown, msg + m_CurrFile);
363 }
364 }
365 return *this;
366 }
367
Rewind(void)368 CInputStreamSource& CInputStreamSource::Rewind(void)
369 {
370 m_CurrIndex = 0;
371 ++(*this);
372 return *this;
373 }
374
GetCurrentFileName(void) const375 string CInputStreamSource::GetCurrentFileName(void) const
376 {
377 return m_CurrFile;
378 }
379
GetCurrentStreamIndex(size_t * count) const380 size_t CInputStreamSource::GetCurrentStreamIndex(size_t* count) const
381 {
382 if (count) {
383 *count = m_Files.size();
384 }
385 return m_CurrIndex;
386 }
387
operator bool() const388 CInputStreamSource::operator bool() const
389 {
390 // The stream contains data if it references a stream (given on input)
391 // owns a stream (extracted from a manifest), or still has a non-empty
392 // queued list of files.
393 return (m_Istr || m_IstrOwned.get() || m_CurrIndex < m_Files.size());
394 }
395
396
397
398 END_NCBI_SCOPE
399
400