1 /*  $Id: stream_source.cpp 542800 2017-08-02 18:30:15Z lavr $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors:  Mike DiCuccio
27  *
28  * File Description:
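 *   Implementation of CInputStreamSource, which hands out input streams
 *   taken from a single stream or file, a manifest listing files, or the
 *   files found under a directory.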
30  */
31 
32 #include <ncbi_pch.hpp>
33 #include <corelib/ncbifile.hpp>
34 #include <util/file_manifest.hpp>
35 #include <util/stream_source.hpp>
36 
37 BEGIN_NCBI_SCOPE
38 
/// Argument prefixes that have been passed to
/// CInputStreamSource::SetStandardInputArgs() so far; queried by
/// CInputStreamSource::HaveArgsForPrefix().
static std::set<std::string> s_InputStreamSourcePrefixes;
40 
SetStandardInputArgs(CArgDescriptions & arg_desc,const string & prefix,const string & description,bool is_mandatory)41 void CInputStreamSource::SetStandardInputArgs(CArgDescriptions& arg_desc,
42                                               const string &prefix,
43                                               const string &description,
44                                               bool is_mandatory)
45 {
46     arg_desc.SetCurrentGroup("Input Options for " + prefix);
47     if (prefix == "input") {
48         arg_desc.AddDefaultKey("input", "InputFile",
49                                "Stream of " + description,
50                                CArgDescriptions::eInputFile,
51                                "-");
52         arg_desc.AddAlias("i", "input");
53     } else {
54         if (is_mandatory) {
55             arg_desc.AddKey(prefix, "InputFile",
56                             "Stream of " + description,
57                             CArgDescriptions::eInputFile);
58         }
59         else {
60             arg_desc.AddOptionalKey(prefix, "InputFile",
61                                     "Stream of " + description,
62                                     CArgDescriptions::eInputFile);
63         }
64     }
65 
66     arg_desc.AddOptionalKey(prefix + "-path", "InputPath",
67                             "Path to " + description,
68                             CArgDescriptions::eString);
69     arg_desc.AddOptionalKey(prefix + "-mask", "FileMask",
70                             "File pattern to search for " + description,
71                             CArgDescriptions::eString);
72     arg_desc.SetDependency(prefix + "-mask",
73                            CArgDescriptions::eRequires,
74                            prefix + "-path");
75 
76     arg_desc.AddOptionalKey(prefix + "-manifest", "InputFile",
77                             "File containing a list of files containing " + description,
78                             CArgDescriptions::eInputFile);
79 
80     arg_desc.SetDependency(prefix,
81                            CArgDescriptions::eExcludes,
82                            prefix + "-manifest");
83 
84     arg_desc.SetDependency(prefix,
85                            CArgDescriptions::eExcludes,
86                            prefix + "-path");
87 
88     arg_desc.SetDependency(prefix + "-manifest",
89                            CArgDescriptions::eExcludes,
90                            prefix + "-path");
91 
92     if (prefix == "input") {
93         arg_desc.AddAlias("I", "input-manifest");
94     }
95 
96     s_InputStreamSourcePrefixes.insert(prefix);
97 }
98 
RecreateInputArgs(const CArgs & args,const string & prefix)99 vector<string> CInputStreamSource::RecreateInputArgs(const CArgs& args, const string &prefix)
100 {
101     vector<string> result;
102     if (args[prefix + "-path"].HasValue()) {
103         result.push_back("-" + prefix + "-path");
104         result.push_back(args[prefix + "-path"].AsString());
105         if (args[prefix + "-mask"]) {
106             result.push_back("-" + prefix + "-mask");
107             result.push_back(args[prefix + "-mask"].AsString());
108         }
109     }
110     else if (args[prefix + "-manifest"].HasValue()) {
111         result.push_back("-" + prefix + "-manifest");
112         result.push_back(args[prefix + "-manifest"].AsString());
113     }
114     else {
115         result.push_back("-" + prefix);
116         result.push_back(args[prefix].AsString());
117     }
118     return result;
119 }
120 
HaveArgsForPrefix(const string & prefix)121 bool CInputStreamSource::HaveArgsForPrefix(const string &prefix)
122 {
123     return s_InputStreamSourcePrefixes.count(prefix) ? true : false;
124 }
125 
CInputStreamSource()126 CInputStreamSource::CInputStreamSource()
127     : m_Istr(NULL), m_CurrIndex(0)
128 {
129 }
130 
131 
CInputStreamSource(const CArgs & args,const string & prefix)132 CInputStreamSource::CInputStreamSource(const CArgs& args, const string& prefix)
133     : m_Istr(NULL), m_CurrIndex(0)
134 {
135     InitArgs(args, prefix);
136 }
137 
138 
~CInputStreamSource()139 CInputStreamSource::~CInputStreamSource()
140 {
141 }
142 
InitArgs(const CArgs & args,const string & prefix)143 void CInputStreamSource::InitArgs(const CArgs& args, const string &prefix)
144 {
145     m_Args.Assign(args);
146     m_Prefix = prefix;
147 
148     if (m_Args[prefix + "-path"].HasValue()) {
149         string path = m_Args[prefix + "-path"].AsString();
150         string mask;
151         if (m_Args[prefix + "-mask"]) {
152             mask = m_Args[prefix + "-mask"].AsString();
153         }
154         InitFilesInDirSubtree(path, mask);
155     }
156     else if (m_Args[prefix + "-manifest"].HasValue()) {
157         InitManifest(m_Args[prefix + "-manifest"].AsString());
158     }
159     else if (m_Args[prefix].HasValue() && m_Args[prefix].AsString() == "-") {
160         /// NOTE: this is ignored if either -input-path or -input-mask is
161         /// provided
162         InitStream(m_Args[prefix].AsInputFile(), m_Args[prefix].AsString());
163     }
164     else if (m_Args[prefix].HasValue()) {
165         /// Input file; init as input file, so it can be opened multiple times
166         InitFile(m_Args[prefix].AsString());
167     }
168 }
169 
170 
171 /// Initialize from a given stream which is the sole content.
172 /// As precondition, expect that the stream is in a good condition
173 /// prior to being handed off to consumers.
174 ///
InitStream(CNcbiIstream & istr,const string & fname)175 void CInputStreamSource::InitStream(CNcbiIstream& istr, const string& fname)
176 {
177     if (m_Istr  ||  m_CurrIndex < m_Files.size()) {
178         NCBI_THROW(CException, eUnknown,
179                    "CInputStreamSource::InitManifest(): "
180                    "attempt to init already initted class");
181     }
182     if (! istr) {
183         NCBI_THROW(CException, eUnknown,
184                    "CInputStreamSource::InitStream(): "
185                    "stream is bad");
186     }
187     m_Files.clear();
188     m_Istr = &istr;
189     m_CurrFile = fname;
190     m_CurrIndex = 0;
191 }
192 
193 
194 /// Initialize from a single file path.
195 ///
InitFile(const string & file_path)196 void CInputStreamSource::InitFile(const string& file_path)
197 {
198     if (m_Istr  ||  m_CurrIndex < m_Files.size()) {
199         NCBI_THROW(CException, eUnknown,
200                    "CInputStreamSource::InitFile(): "
201                    "attempt to init already initted class");
202     }
203 
204     /**
205      * commented out: this breaks stream processing
206     if ( !CFile(file_path).Exists() ) {
207         NCBI_THROW(CException, eUnknown,
208                    "input file " + file_path + " does not exist");
209     }
210     **/
211 
212     m_Files.clear();
213     m_Files.push_back(file_path);
214     Rewind();
215 }
216 
217 
218 /// Initialize from a manifest file.
219 ///
220 /// @see CFileManifest
InitManifest(const string & manifest)221 void CInputStreamSource::InitManifest(const string& manifest)
222 {
223     if (m_Istr  || m_CurrIndex < m_Files.size()) {
224         NCBI_THROW(CException, eUnknown,
225                    "CInputStreamSource::InitManifest(): "
226                    "attempt to init already initted class");
227     }
228 
229     m_Files.clear();
230     CFileManifest src(manifest);
231     vector<string> all(src.GetAllFilePaths());
232     std::copy( all.begin(), all.end(), std::back_inserter(m_Files));
233 
234     _TRACE("Added " << m_Files.size() << " files from input manifest");
235 
236     Rewind();
237 }
238 
239 
240 /// Initialize from a file search path
241 ///
InitFilesInDirSubtree(const string & file_path,const string & file_mask)242 void CInputStreamSource::InitFilesInDirSubtree(const string& file_path,
243                                                const string& file_mask)
244 {
245     if (m_Istr  ||  m_CurrIndex < m_Files.size()) {
246         NCBI_THROW(CException, eUnknown,
247                    "CInputStreamSource::InitFilesInDirSubtree(): "
248                    "atemmpt to init already initted class");
249     }
250 
251     CDir d(file_path);
252     if ( !d.Exists() ) {
253         NCBI_THROW(CException, eUnknown,
254                    "input directory " + file_path + " does not exist");
255     }
256 
257     vector<string> paths;
258     paths.push_back(file_path);
259 
260     vector<string> masks;
261     if ( !file_mask.empty() ) {
262         masks.push_back(file_mask);
263     } else {
264         masks.push_back("*");
265     }
266 
267     m_Files.clear();
268     FindFiles(m_Files,
269               paths.begin(), paths.end(),
270               masks.begin(), masks.end(),
271               fFF_File | fFF_Recursive);
272     _TRACE("Added " << m_Files.size() << " files from input path");
273 
274     Rewind();
275 }
276 
277 
GetStream(string * fname)278 CNcbiIstream& CInputStreamSource::GetStream(string* fname)
279 {
280     if (m_Istr) {
281         if (fname) {
282             *fname = m_CurrFile;
283         }
284         return *m_Istr;
285     }
286 
287     if (m_IstrOwned.get()) {
288         if (fname) {
289             *fname = m_CurrFile;
290         }
291         return *m_IstrOwned;
292     }
293 
294     NCBI_THROW(CException, eUnknown, "All input streams consumed");
295 }
296 
GetStream(void)297 CNcbiIstream& CInputStreamSource::GetStream(void)
298 {
299     if (m_Istr) {
300         return *m_Istr;
301     }
302 
303     if (m_IstrOwned.get()) {
304         return *m_IstrOwned;
305     }
306 
307     NCBI_THROW(CException, eUnknown, "All input streams consumed");
308 }
309 
310 
operator *()311 CNcbiIstream& CInputStreamSource::operator*()
312 {
313     return GetStream();
314 }
315 
316 
operator ++()317 CInputStreamSource& CInputStreamSource::operator++()
318 {
319     // The next stream can be held in either of two places. Clear both.
320 
321     // Clear first place.
322     if (m_Istr) {
323         if (m_Istr->bad()) {
324             // Check that the stream, at the end, didn't go bad as might
325             // happen if there was a disk read error. On the other hand,
326             // ok if it has failbit set so ignore that, e.g. getline sets
327             // failbit at the last line, if it has a teminator.
328             NCBI_THROW(CException, eUnknown,
329                        "CInputStreamSource::operator++(): "
330                        "Unknown error in input stream, "
331                        "which is in a bad state after use");
332         }
333         m_Istr = NULL;
334     }
335 
336     // Clear second place.
337     if (m_IstrOwned.get()) {
338         if (m_IstrOwned->bad()) {
339             // Samecheck  as for m_Istr.
340             string msg("CInputStreamSource::operator++(): "
341                        "Unknown error reading file, "
342                        "which is in a bad state after use: ");
343             NCBI_THROW(CException, eUnknown, msg + m_CurrFile);
344         }
345         m_IstrOwned.reset();
346     }
347 
348     // The current filename currently applies to only the first source,
349     // but someday might apply to others, so clear it here rather than
350     // inside the above conditionals.
351     m_CurrFile.erase();
352 
353     // Advance to the next stream, if there is any.
354     if (m_CurrIndex < m_Files.size()) {
355         m_CurrFile = m_Files[m_CurrIndex++];
356         m_IstrOwned.reset(new CNcbiIfstream(m_CurrFile.c_str()));
357         if (m_IstrOwned->fail()) {
358             // Do not provide to clients with streams that are already
359             // known not to be good (fail, meaning badbit or failbit).
360             string msg("CInputStreamSource::operator++(): "
361                        "File is not accessible: ");
362             NCBI_THROW(CException, eUnknown, msg + m_CurrFile);
363         }
364     }
365     return *this;
366 }
367 
Rewind(void)368 CInputStreamSource& CInputStreamSource::Rewind(void)
369 {
370     m_CurrIndex = 0;
371     ++(*this);
372     return *this;
373 }
374 
GetCurrentFileName(void) const375 string CInputStreamSource::GetCurrentFileName(void) const
376 {
377     return m_CurrFile;
378 }
379 
GetCurrentStreamIndex(size_t * count) const380 size_t CInputStreamSource::GetCurrentStreamIndex(size_t* count) const
381 {
382     if (count) {
383         *count = m_Files.size();
384     }
385     return m_CurrIndex;
386 }
387 
operator bool() const388 CInputStreamSource::operator bool() const
389 {
390     // The stream contains data if it references a stream (given on input)
391     // owns a stream (extracted from a manifest), or still has a non-empty
392     // queued list of files.
393     return (m_Istr  ||  m_IstrOwned.get()  ||  m_CurrIndex < m_Files.size());
394 }
395 
396 
397 
398 END_NCBI_SCOPE
399 
400