1 /*
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author:  Amelia Fong
27 *
28 */
29 
#include <ncbi_pch.hpp>
#include <objtools/blast/seqdb_reader/seqidlist_reader.hpp>
#include <objects/seqloc/Seq_id.hpp>
#include <cstring>
33 
34 
35 BEGIN_NCBI_SCOPE
36 USING_SCOPE(objects);
37 
38 
39 class CSeqidlistRead
40 {
41 public:
42 	CSeqidlistRead (CMemoryFile & file);
43 
GetListInfo(SBlastSeqIdListInfo & info)44 	void GetListInfo(SBlastSeqIdListInfo & info){ info = m_info;};
45 	int GetIds(vector<CSeqDBGiList::SSiOid>  & idlist);
46 
47 private:
x_GetUint8()48 	inline Uint8 x_GetUint8() { Uint8 rv= *((Uint8 *) m_Ptr); m_Ptr +=8; return rv;}
x_GetUint4()49 	inline Uint4 x_GetUint4() { Uint4 rv= *((Uint4 *) m_Ptr); m_Ptr +=4; return rv;}
x_GetChar()50 	inline char x_GetChar() {char rv = *m_Ptr; m_Ptr++; return rv;}
x_GetString(string & rv,Uint4 len)51 	inline void x_GetString(string & rv, Uint4 len) {rv.assign (m_Ptr, len); m_Ptr+= len;}
52 
53 	char * m_Ptr;
54 	char * m_EndPtr;
55 	SBlastSeqIdListInfo m_info;
56 };
57 
CSeqidlistRead(CMemoryFile & file)58 CSeqidlistRead::CSeqidlistRead (CMemoryFile & file) : m_Ptr((char*) file.GetPtr()), m_EndPtr((char*) file.GetPtr()) {
59 	if(m_Ptr == NULL) {
60 		NCBI_THROW(CSeqDBException, eArgErr, "Failed to map seqidlist file ");
61 	}
62 
63 	char null_byte = x_GetChar();
64 	if (null_byte == 0) {
65 		m_info.is_v4 = false;
66 		Uint8 file_size = file.GetFileSize();
67 		m_info.file_size = x_GetUint8();
68 		if (m_info.file_size != file_size) {
69 			NCBI_THROW(CSeqDBException, eArgErr, "Invalid seqidlist file");
70 		}
71 		m_EndPtr += file_size;
72 		m_info.num_ids = x_GetUint8();
73 		Uint4 title_length = x_GetUint4();
74 		x_GetString(m_info.title, title_length);
75 		char file_create_date_length = x_GetChar();
76 		x_GetString(m_info.create_date, file_create_date_length);
77 		m_info.db_vol_length = x_GetUint8();
78 		if(m_info.db_vol_length != 0) {
79 			char file_db_create_date_length = x_GetChar();
80 			x_GetString(m_info.db_create_date, file_db_create_date_length);
81 			Uint4 file_vol_names_length = x_GetUint4();
82 			x_GetString(m_info.db_vol_names, file_vol_names_length);
83 		}
84 	}
85 }
86 
GetIds(vector<CSeqDBGiList::SSiOid> & idlist)87 int CSeqidlistRead::GetIds(vector<CSeqDBGiList::SSiOid>  & idlist)
88 {
89 	const unsigned char byte_max = 0xFF;
90 	unsigned int i = 0;
91 	idlist.clear();
92 	idlist.resize(m_info.num_ids);
93 	for(; (m_Ptr < m_EndPtr) && (i < m_info.num_ids); i++) {
94 		unsigned char id_len = (unsigned char) x_GetChar();
95 		if(id_len == byte_max) {
96 			Uint4 long_id_len = x_GetUint4();
97 			x_GetString(idlist[i].si, long_id_len);
98 		}
99 		else {
100 			x_GetString(idlist[i].si, id_len);
101 		}
102 	}
103 	if(i != m_info.num_ids) {
104 		NCBI_THROW(CSeqDBException, eArgErr, "Invalid total num of ids in seqidlist file");
105 	}
106 
107 	return i;
108 }
109 
110 
GetSeqidlist(CMemoryFile & file,vector<CSeqDBGiList::SSiOid> & idlist,SBlastSeqIdListInfo & list_info)111 int CBlastSeqidlistFile::GetSeqidlist(CMemoryFile & file, vector<CSeqDBGiList::SSiOid>  & idlist,
112 		                              SBlastSeqIdListInfo & list_info)
113 {
114 
115 	CSeqidlistRead list(file);
116 	list.GetListInfo(list_info);
117 	list.GetIds(idlist);
118 
119 	return list_info.num_ids;
120 }
121 
GetSeqidlistInfo(const string & filename,SBlastSeqIdListInfo & list_info)122 int CBlastSeqidlistFile::GetSeqidlistInfo(const string & filename, SBlastSeqIdListInfo & list_info)
123 {
124 	string file = SeqDB_ResolveDbPath(filename);
125 	CMemoryFile in(file);
126 	CSeqidlistRead list(in);
127 	list.GetListInfo(list_info);
128 	return list_info.num_ids;
129 
130 }
131 
PrintSeqidlistInfo(const string & filename,CNcbiOstream & os)132 void CBlastSeqidlistFile::PrintSeqidlistInfo(const string & filename, CNcbiOstream & os)
133 {
134 	SBlastSeqIdListInfo list_info;
135 	if (CBlastSeqidlistFile::GetSeqidlistInfo(filename, list_info) > 0) {
136 		os <<"Num of Ids: " <<  list_info.num_ids << "\n";
137 		os <<"Title: " << list_info.title << "\n";
138 		os <<"Create Date: " << list_info.create_date << "\n";
139 		if(list_info.db_vol_length > 0) {
140 			os << "DB Info: \n";
141 			os << "\t" << "Total Vol Length: " << list_info.db_vol_length  << "\n";
142 			os << "\t" << "DB Create Date: " << list_info.db_create_date  << "\n";
143 			os << "\t" << "DB Vols: ";
144 			vector<string> vols;
145 			NStr::Split(list_info.db_vol_names, " ", vols);
146 			for(unsigned int i=0; i < vols.size(); i ++ ) {
147 				os << "\n\t\t" << vols[i];
148 			}
149 		}
150 	}
151 	else {
152 		os << "Seqidlist file is not in blast db version 5 format";
153 	}
154 	os << endl;
155 }
156 
157 END_NCBI_SCOPE
158