1 #include <string>
2 #include <iostream>
3 #include <sstream>
4 #include <fstream>
5 #include <vector>
6 #include <unordered_set>
7 #include <numeric>
8 #include <stdint.h>
9 #include <regex>
10
11 #include "KmerIndex.h"
12 #include "common.h"
13
MergeBatchDirectories(const ProgramOptions & opt,int & num_targets,int64_t & num_processed,int64_t & num_pseudoaligned,int64_t & num_unique,int & index_version)14 bool MergeBatchDirectories(const ProgramOptions &opt, int& num_targets, int64_t& num_processed,
15 int64_t& num_pseudoaligned, int64_t& num_unique, int& index_version) {
16 int ndirs = opt.files.size();
17 std::vector<int> numCells;
18 std::unordered_set<std::string> cellSet;
19 std::vector<std::string> cells;
20
21 // stats to collect
22 num_targets = -1;
23 num_processed = 0;
24 num_pseudoaligned = 0;
25 num_unique = 0;
26 index_version = -1;
27
28 // get stat info from run_info.json
29 for (const auto &fn : opt.files) {
30 std::ifstream jfile(fn + "/run_info.json");
31 std::ostringstream ss;
32 ss << jfile.rdbuf();
33 std::string s = ss.str();
34 jfile.close();
35
36 auto match_int = [&](std::string name, int64_t& val ) -> bool {
37 std::string re_s = "\"" + name + "\": (\\d+),";
38 std::regex re(re_s);
39 std::smatch m;
40 if (std::regex_search(s, m, re)) {
41 if (m.empty() || m.size() <= 1) {
42 return false;
43 }
44 val = std::stoi(m[1]);
45 return true;
46 } else {
47 return false;
48 }
49 };
50
51 int64_t tmp = -1;
52 if (match_int("n_processed", tmp)) {
53 num_processed += tmp;
54 }
55 if (match_int("n_pseudoaligned", tmp)) {
56 num_pseudoaligned += tmp;
57 }
58 if (match_int("n_unique", tmp)) {
59 num_unique += tmp;
60 }
61 if (index_version == -1 && match_int("index_version",tmp)) {
62 index_version = (int) tmp;
63 }
64 if (num_targets == -1 && match_int("n_targets", tmp)) {
65 num_targets = (int) tmp;
66 }
67
68 }
69
70 // merge cell lists
71 for (const auto &fn : opt.files) {
72 std::ifstream cfile((fn + "/matrix.cells"));
73
74 std::string line;
75 std::string id,f1,f2;
76 int num = 0;
77 while (std::getline(cfile,line)) {
78 if (line.size() == 0) {
79 continue;
80 }
81 std::stringstream ss(line);
82 ss >> id;
83
84 if (cellSet.find(id) == cellSet.end()) {
85 cells.push_back(id);
86 cellSet.insert(id);
87 num++;
88 } else {
89 std::cerr << "Error: cell id \"" << id << "\" repeated in file " << fn << "/matrix.cells" << std::endl;
90 return false;
91 }
92 }
93 numCells.push_back(num);
94 }
95
96 std::ofstream cfile((opt.output + "/matrix.cells"));
97 for (const auto &id : cells) {
98 cfile << id << "\n";
99 }
100 cfile.close();
101
102 // merge ec lists
103 KmerIndex index(opt);
104 std::vector<std::vector<int>> ecTrans;
105 std::vector<int> numEcs;
106
107 for (int i = 0; i < ndirs; i++) {
108 std::vector<int> ctrans;
109 std::ifstream ecFile(opt.files[i] + "/matrix.ec");
110
111 std::string line,t;
112 std::vector<int> c;
113 while (std::getline(ecFile,line)) {
114 c.clear();
115 int ec = -1;
116 if (line.size() == 0) {
117 continue;
118 }
119 std::stringstream ss(line);
120 ss >> ec;
121 while (std::getline(ss, t, ',')) {
122 c.push_back(std::stoi(t));
123 }
124
125 assert(ctrans.size() == ec);
126
127 int index_ec = -1;
128 auto search = index.ecmapinv.find(c);
129 if (search != index.ecmapinv.end()) {
130 index_ec = search->second;
131 } else {
132 index_ec = index.ecmap.size();
133 index.ecmap.push_back(c);
134 index.ecmapinv.insert({c,index_ec});
135 }
136 ctrans.push_back(index_ec);
137 }
138 numEcs.push_back(ctrans.size());
139 ecTrans.push_back(std::move(ctrans));
140 }
141
142 //debug
143 /*
144 for (auto &ct : ecTrans) {
145 std::cout << "-- " << std::endl;
146 for (int i = 0; i < ct.size(); i++) {
147 std::cout << i << "\t" << ct[i] << std::endl;
148 }
149 }
150 */
151
152 std::ofstream ecFile(opt.output + "/matrix.ec");
153 for (int j = 0; j < index.ecmap.size(); j++) {
154 ecFile << j << "\t";
155 // output the rest of the class
156 const auto &v = index.ecmap[j];
157 bool first = true;
158 for (auto x : v) {
159 if (!first) {
160 ecFile << ",";
161 } else {
162 first = false;
163 }
164 ecFile << x;
165 }
166 ecFile << "\n";
167 }
168 ecFile.close();
169
170 // get stats
171 std::vector<int64_t> numNonzero;
172 for (int i = 0; i < ndirs; i++) {
173 std::ifstream mFile(opt.files[i] + "/matrix.tcc.mtx");
174 std::string line;
175 std::getline(mFile, line);
176 assert(!line.empty() && line[0] =='%');
177 int ncell,nec;
178 int64_t nz;
179 mFile >> ncell >> nec >> nz;
180 mFile.close();
181
182 if (ncell != numCells[i]) {
183 std::cerr << "Error: number of cells in mtx and cell file do not match in directory " << opt.files[i] << std::endl;
184 return false;
185 }
186
187 if (nec != numEcs[i]) {
188 std::cerr << "Error: number of equivalence classes in mtx and cell file do not match in directory " << opt.files[i] << std::endl;
189 return false;
190 }
191 numNonzero.push_back(nz);
192 }
193
194
195 // merge into single file
196 std::ofstream omFile(opt.output + "/matrix.tcc.mtx");
197 omFile << "%%MatrixMarket matrix coordinate real general\n";
198 int sNumCells = std::accumulate(numCells.begin(), numCells.end(), 0);
199 int sNumEcs = index.ecmap.size();
200 int64_t sNz = std::accumulate(numNonzero.begin(), numNonzero.end(), 0);
201 omFile << sNumCells << "\t" << sNumEcs << "\t" << sNz << "\n";
202 int64_t nz = 0;
203 int cellOffset = 0;
204 for (int i = 0; i < ndirs; i++) {
205 // merge all files
206 std::ifstream mFile(opt.files[i] + "/matrix.tcc.mtx");
207 std::string line;
208 std::getline(mFile, line); // mtx format
209 std::getline(mFile, line); // specification
210 if (i > 0) {
211 cellOffset += numCells[i-1];
212 }
213
214 int local_cellid, local_ec, count;
215 while (mFile >> local_cellid >> local_ec >> count) {
216 local_ec -= 1;
217 int ec = ecTrans[i][local_ec];
218 int cellid = cellOffset + local_cellid;
219 nz++;
220 omFile << cellid << "\t" << (ec+1) << "\t" << count << "\n";
221 }
222 }
223
224 if (nz != sNz) {
225 std::cerr << "Warning: number of matrix entries inconsistent with header" << std::endl;
226 }
227 omFile.close();
228
229
230
231 return true;
232 }
233
234
235
236