1 #include <string>
2 #include <iostream>
3 #include <sstream>
4 #include <fstream>
5 #include <vector>
6 #include <unordered_set>
7 #include <numeric>
8 #include <stdint.h>
9 #include <regex>
10 
11 #include "KmerIndex.h"
12 #include "common.h"
13 
MergeBatchDirectories(const ProgramOptions & opt,int & num_targets,int64_t & num_processed,int64_t & num_pseudoaligned,int64_t & num_unique,int & index_version)14 bool MergeBatchDirectories(const ProgramOptions &opt, int& num_targets, int64_t& num_processed,
15                            int64_t& num_pseudoaligned, int64_t&  num_unique, int& index_version) {
16   int ndirs = opt.files.size();
17   std::vector<int> numCells;
18   std::unordered_set<std::string> cellSet;
19   std::vector<std::string> cells;
20 
21   // stats to collect
22   num_targets = -1;
23   num_processed = 0;
24   num_pseudoaligned = 0;
25   num_unique = 0;
26   index_version = -1;
27 
28   // get stat info from run_info.json
29   for (const auto &fn : opt.files) {
30     std::ifstream jfile(fn + "/run_info.json");
31     std::ostringstream ss;
32     ss << jfile.rdbuf();
33     std::string s = ss.str();
34     jfile.close();
35 
36     auto match_int = [&](std::string name, int64_t& val ) -> bool {
37       std::string re_s = "\"" + name + "\": (\\d+),";
38       std::regex re(re_s);
39       std::smatch m;
40       if (std::regex_search(s, m, re)) {
41         if (m.empty() || m.size() <= 1) {
42           return false;
43         }
44         val = std::stoi(m[1]);
45         return true;
46       } else {
47         return false;
48       }
49     };
50 
51     int64_t tmp = -1;
52     if (match_int("n_processed", tmp)) {
53       num_processed += tmp;
54     }
55     if (match_int("n_pseudoaligned", tmp)) {
56       num_pseudoaligned += tmp;
57     }
58     if (match_int("n_unique", tmp)) {
59       num_unique += tmp;
60     }
61     if (index_version == -1 && match_int("index_version",tmp)) {
62       index_version = (int) tmp;
63     }
64     if (num_targets == -1 && match_int("n_targets", tmp)) {
65       num_targets = (int) tmp;
66     }
67 
68   }
69 
70   // merge cell lists
71   for (const auto &fn : opt.files) {
72     std::ifstream cfile((fn + "/matrix.cells"));
73 
74     std::string line;
75     std::string id,f1,f2;
76     int num = 0;
77     while (std::getline(cfile,line)) {
78       if (line.size() == 0) {
79         continue;
80       }
81       std::stringstream ss(line);
82       ss >> id;
83 
84       if (cellSet.find(id) == cellSet.end()) {
85         cells.push_back(id);
86         cellSet.insert(id);
87         num++;
88       } else {
89         std::cerr << "Error: cell id \"" << id << "\" repeated in file " << fn << "/matrix.cells" << std::endl;
90         return false;
91       }
92     }
93     numCells.push_back(num);
94   }
95 
96   std::ofstream cfile((opt.output + "/matrix.cells"));
97   for (const auto &id : cells) {
98     cfile << id << "\n";
99   }
100   cfile.close();
101 
102   // merge ec lists
103   KmerIndex index(opt);
104   std::vector<std::vector<int>> ecTrans;
105   std::vector<int> numEcs;
106 
107   for (int i = 0; i < ndirs; i++) {
108     std::vector<int> ctrans;
109     std::ifstream ecFile(opt.files[i] + "/matrix.ec");
110 
111     std::string line,t;
112     std::vector<int> c;
113     while (std::getline(ecFile,line)) {
114       c.clear();
115       int ec = -1;
116       if (line.size() == 0) {
117         continue;
118       }
119       std::stringstream ss(line);
120       ss >> ec;
121       while (std::getline(ss, t, ',')) {
122         c.push_back(std::stoi(t));
123       }
124 
125       assert(ctrans.size() == ec);
126 
127       int index_ec = -1;
128       auto search = index.ecmapinv.find(c);
129       if (search != index.ecmapinv.end()) {
130         index_ec = search->second;
131       } else {
132          index_ec = index.ecmap.size();
133          index.ecmap.push_back(c);
134          index.ecmapinv.insert({c,index_ec});
135       }
136       ctrans.push_back(index_ec);
137     }
138     numEcs.push_back(ctrans.size());
139     ecTrans.push_back(std::move(ctrans));
140   }
141 
142   //debug
143   /*
144   for (auto &ct : ecTrans) {
145     std::cout << "-- " << std::endl;
146     for (int i = 0; i < ct.size(); i++) {
147       std::cout << i << "\t" << ct[i] << std::endl;
148     }
149   }
150   */
151 
152   std::ofstream ecFile(opt.output + "/matrix.ec");
153   for (int j = 0; j < index.ecmap.size(); j++) {
154     ecFile << j << "\t";
155     // output the rest of the class
156     const auto &v = index.ecmap[j];
157     bool first = true;
158     for (auto x : v) {
159       if (!first) {
160         ecFile << ",";
161       } else {
162         first = false;
163       }
164       ecFile << x;
165     }
166     ecFile << "\n";
167   }
168   ecFile.close();
169 
170   // get stats
171   std::vector<int64_t> numNonzero;
172   for (int i = 0; i < ndirs; i++) {
173     std::ifstream mFile(opt.files[i] + "/matrix.tcc.mtx");
174     std::string line;
175     std::getline(mFile, line);
176     assert(!line.empty() && line[0] =='%');
177     int ncell,nec;
178     int64_t nz;
179     mFile >> ncell >> nec >> nz;
180     mFile.close();
181 
182     if (ncell != numCells[i]) {
183       std::cerr << "Error: number of cells in mtx and cell file do not match in directory  " << opt.files[i] << std::endl;
184       return false;
185     }
186 
187     if (nec != numEcs[i]) {
188       std::cerr << "Error: number of equivalence classes in mtx and cell file do not match in directory  " << opt.files[i] << std::endl;
189       return false;
190     }
191     numNonzero.push_back(nz);
192   }
193 
194 
195   // merge into single file
196   std::ofstream omFile(opt.output + "/matrix.tcc.mtx");
197   omFile << "%%MatrixMarket matrix coordinate real general\n";
198   int sNumCells = std::accumulate(numCells.begin(), numCells.end(), 0);
199   int sNumEcs = index.ecmap.size();
200   int64_t sNz = std::accumulate(numNonzero.begin(), numNonzero.end(), 0);
201   omFile << sNumCells << "\t" << sNumEcs << "\t" << sNz << "\n";
202   int64_t nz = 0;
203   int cellOffset = 0;
204   for (int i = 0; i < ndirs; i++) {
205     // merge all files
206     std::ifstream mFile(opt.files[i] + "/matrix.tcc.mtx");
207     std::string line;
208     std::getline(mFile, line); // mtx format
209     std::getline(mFile, line); // specification
210     if (i > 0) {
211       cellOffset += numCells[i-1];
212     }
213 
214     int local_cellid, local_ec, count;
215     while (mFile >> local_cellid >> local_ec >> count) {
216       local_ec -= 1;
217       int ec = ecTrans[i][local_ec];
218       int cellid = cellOffset + local_cellid;
219       nz++;
220       omFile << cellid << "\t" << (ec+1) << "\t" << count << "\n";
221     }
222   }
223 
224   if (nz != sNz) {
225     std::cerr << "Warning: number of matrix entries inconsistent with header" << std::endl;
226   }
227   omFile.close();
228 
229 
230 
231   return true;
232 }
233 
234 
235 
236