1 //![include]
2 #include <iostream>
3 #include <seqan/seq_io.h>
4 #include <seqan/journaled_set.h>
5
6 using namespace seqan;
7 //![include]
8
9 //![searchAtBorder]
10 template <typename TJournalEntriesIterator, typename TJournal, typename TPattern>
_searchAtBorder(String<int> & hitTarget,TJournalEntriesIterator & entriesIt,TJournal const & journal,TPattern const & pattern)11 void _searchAtBorder(String<int> & hitTarget,
12 TJournalEntriesIterator & entriesIt,
13 TJournal const & journal,
14 TPattern const & pattern)
15 {
16 typedef typename Iterator<TJournal const, Standard>::Type TJournalIterator;
17
18 // Define region before the border to the next node to search for the pattern.
19 TJournalIterator nodeIter = iter(journal, entriesIt->virtualPosition +
20 _max(0, (int) entriesIt->length - (int) length(pattern) + 1));
21 // Define end of search region.
22 TJournalIterator nodeEnd = iter(journal, _min(entriesIt->virtualPosition + entriesIt->length, length(journal) -
23 length(pattern) + 1));
24 // Move step by step over search region.
25 if (nodeEnd == end(journal))
26 return;
27
28 for (; nodeIter != nodeEnd; ++nodeIter)
29 {
30 // Define compare iterator.
31 TJournalIterator verifyIter = nodeIter;
32 bool isHit = true;
33 // Compare pattern with current search window.
34 for (unsigned posPattern = 0; posPattern < length(pattern); ++posPattern, ++verifyIter)
35 {
36 // Comparing the pattern value with the current value of the iterator.
37 if (pattern[posPattern] != getValue(verifyIter))
38 {
39 isHit = false;
40 break;
41 }
42 }
43 // Report hit if found.
44 if (isHit)
45 appendValue(hitTarget, position(nodeIter));
46 }
47 }
48 //![searchAtBorder]
49
50 //![findInPatchNodePart1]
51 template <typename TJournalEntriesIterator, typename TJournal, typename TPattern>
_findInPatchNode(String<int> & hitTarget,TJournalEntriesIterator & entriesIt,TJournal const & journal,TPattern const & pattern)52 void _findInPatchNode(String<int> & hitTarget,
53 TJournalEntriesIterator & entriesIt,
54 TJournal const & journal,
55 TPattern const & pattern)
56 {
57 typedef typename Iterator<TJournal const, Standard>::Type TJournalIterator;
58 //![findInPatchNodePart1]
59
60 //![findInPatchNodePart2]
61 // Search for pattern in the insertion node.
62 TJournalIterator patchIter = iter(journal, entriesIt->virtualPosition);
63 TJournalIterator patchEnd = patchIter + _max(0, (int)entriesIt->length - (int)length(pattern) + 1);
64 // Move step by step over search region.
65 for (; patchIter != patchEnd; ++patchIter)
66 {
67 //![findInPatchNodePart2]
68 //![findInPatchNodePart3]
69 TJournalIterator verifyIter = patchIter;
70 bool isHit = true;
71 // Search for pattern in the insertion node.
72 for (unsigned posPattern = 0; posPattern < length(pattern); ++posPattern, ++verifyIter)
73 {
74 // Comparing the pattern value with the current value of the iterator.
75 if (pattern[posPattern] != getValue(verifyIter))
76 {
77 isHit = false;
78 break;
79 }
80 }
81 if (isHit)
82 appendValue(hitTarget, position(patchIter));
83 }
84 }
85 //![findInPatchNodePart3]
86
87 //![findInOriginalNode]
88 template <typename TJournalEntriesIterator, typename TPattern>
_findInOriginalNode(String<int> & hitTarget,TJournalEntriesIterator & entriesIt,TPattern const & pattern,String<int> const & refHits)89 void _findInOriginalNode(String<int> & hitTarget,
90 TJournalEntriesIterator & entriesIt,
91 TPattern const & pattern,
92 String<int> const & refHits)
93 {
94 // Define an Iterator which iterates over the reference hit set.
95 typedef typename Iterator<String<int> const, Standard>::Type THitIterator;
96
97 // Check if hits exist in the reference.
98 if (!empty(refHits))
99 {
100 // Find upper bound to physical position in sorted refHits.
101 THitIterator itHit = std::upper_bound(begin(refHits), end(refHits), (int)entriesIt->physicalPosition);
102 // Make sure we do not miss hits that begin at physical position of current node.
103 if (itHit != begin(refHits) && *(itHit - 1) >= (int)entriesIt->physicalPosition)
104 --itHit;
105 // Store all hits that are found in the region of the reference which is covered by this node.
106 while ((int)*itHit < ((int)entriesIt->physicalPosition + (int)entriesIt->length - (int)length(pattern) + 1) && itHit != end(refHits))
107 {
108 appendValue(hitTarget, entriesIt->virtualPosition + (*itHit - (int)entriesIt->physicalPosition));
109 ++itHit;
110 }
111 }
112 }
113 //![findInOriginalNode]
114
115 //![findPatternInJournalStringPart1]
116 template <typename TValue, typename THostSpec, typename TJournalSpec, typename TBufferSpec, typename TPattern>
findPatternInJournalString(String<int> & hitTarget,String<TValue,Journaled<THostSpec,TJournalSpec,TBufferSpec>> const & journal,TPattern const & pattern,String<int> const & refHits)117 void findPatternInJournalString(String<int> & hitTarget,
118 String<TValue, Journaled<THostSpec, TJournalSpec, TBufferSpec> > const & journal,
119 TPattern const & pattern,
120 String<int> const & refHits)
121 {
122 typedef String<TValue, Journaled<THostSpec, TJournalSpec, TBufferSpec> > const TJournal;
123 typedef typename JournalType<TJournal>::Type TJournalEntries;
124 typedef typename Iterator<TJournalEntries>::Type TJournalEntriesIterator;
125
126 if (length(pattern) > length(journal))
127 return;
128 //![findPatternInJournalStringPart1]
129
130 //![findPatternInJournalStringPart2]
131 TJournalEntriesIterator it = begin(journal._journalEntries);
132 TJournalEntriesIterator itEnd = findInJournalEntries(journal._journalEntries, length(journal) - length(pattern) + 1) + 1;
133 //![findPatternInJournalStringPart2]
134
135 //![findPatternInJournalStringPart3]
136 while (it != itEnd)
137 {
138 if (it->segmentSource == SOURCE_ORIGINAL) // Find a possible hit in the current source vertex.
139 {
140 _findInOriginalNode(hitTarget, it, pattern, refHits);
141 }
142 if (it->segmentSource == SOURCE_PATCH) // Search for pattern within the patch node.
143 {
144 _findInPatchNode(hitTarget, it, journal, pattern);
145 }
146 // Scan the border for a possible match.
147 _searchAtBorder(hitTarget, it, journal, pattern);
148 ++it;
149 }
150 }
151 //![findPatternInJournalStringPart3]
152
153 //![findPatternInReference]
154 template <typename TString, typename TPattern>
findPatternInReference(String<int> & hits,TString const & reference,TPattern const & pattern)155 void findPatternInReference(String<int> & hits,
156 TString const & reference,
157 TPattern const & pattern)
158 {
159 // Check whether the pattern fits into the sequence.
160 if (length(pattern) > length(reference))
161 return;
162
163 //
164 for (unsigned pos = 0; pos < length(reference) - length(pattern) + 1; ++pos)
165 {
166 bool isHit = true;
167
168 for (unsigned posPattern = 0; posPattern < length(pattern); ++posPattern)
169 {
170 if (pattern[posPattern] != reference[posPattern + pos])
171 {
172 isHit = false;
173 break;
174 }
175 }
176 // Report the position if found a hit.
177 if (isHit)
178 appendValue(hits, pos);
179 }
180 }
181 //![findPatternInReference]
182
183 //![searchPatternPart1]
184 template <typename TString, typename TPattern>
searchPattern(StringSet<String<int>> & hitSet,StringSet<TString,Owner<JournaledSet>> const & journalSet,TPattern const & pattern)185 void searchPattern(StringSet<String<int> > & hitSet,
186 StringSet<TString, Owner<JournaledSet> > const & journalSet,
187 TPattern const & pattern)
188 {
189 typedef StringSet<TString, Owner<JournaledSet> > TJournalSet;
190 typedef typename Host<TJournalSet const>::Type THost;
191
192 // Check for valid initial state.
193 if (empty(host(journalSet)))
194 {
195 std::cout << "No reference set. Aborted search!" << std::endl;
196 return;
197 }
198
199 // Reset the hitSet to avoid phantom hits coming from a previous search.
200 clear(hitSet);
201 resize(hitSet, length(journalSet) + 1);
202 //![searchPatternPart1]
203 //![searchPatternPart2]
204 // Access the reference sequence.
205 THost & globalRef = host(journalSet);
206 // Search for pattern in the reference sequence.
207 findPatternInReference(hitSet[0], globalRef, pattern);
208 //![searchPatternPart2]
209
210 //![searchPatternPart3]
211 // Search for pattern in the journaled sequences.
212 for (unsigned i = 0; i < length(journalSet); ++i)
213 findPatternInJournalString(hitSet[i + 1], journalSet[i], pattern, hitSet[0]);
214 }
215 //![searchPatternPart3]
216
217 //![laodAndJoin]
218 template <typename TString, typename TSpec>
219 inline int
loadAndJoin(StringSet<TString,Owner<JournaledSet>> & journalSet,SeqFileIn & databaseFile,JoinConfig<TSpec> const & joinConfig)220 loadAndJoin(StringSet<TString, Owner<JournaledSet> > & journalSet,
221 SeqFileIn & databaseFile,
222 JoinConfig<TSpec> const & joinConfig)
223 {
224 typedef typename Host<TString>::Type THost;
225
226 clear(journalSet);
227
228 String<char> seqId;
229 THost sequence;
230
231 // No sequences in the fasta file!
232 if (atEnd(databaseFile))
233 {
234 std::cerr << "Empty FASTA file." << std::endl;
235 return -1;
236 }
237 // First read sequence for reference sequence.
238 readRecord(seqId, sequence, databaseFile);
239
240 // We have to create the global reference sequence otherwise we loose the information after this function terminates.
241 createHost(journalSet, sequence);
242
243 // If there are more
244 while (!atEnd(databaseFile))
245 {
246 readRecord(seqId, sequence, databaseFile);
247 appendValue(journalSet, TString(sequence));
248 join(journalSet, length(journalSet) - 1, joinConfig);
249 }
250 return 0;
251 }
252 //![laodAndJoin]
253
254 //![main]
main()255 int main()
256 {
257 // Definition of the used types.
258 typedef String<Dna, Alloc<> > TSequence;
259 typedef String<Dna, Journaled<Alloc<>, SortedArray, Alloc<> > > TJournal;
260 typedef StringSet<TJournal, Owner<JournaledSet> > TJournaledSet;
261
262 // Open the stream to the file containing the sequences.
263 CharString seqDatabasePath = getAbsolutePath("demos/tutorial/journaled_set/sequences.fasta");
264 SeqFileIn databaseFile(toCString(seqDatabasePath));
265
266 // Reading each sequence and journal them.
267 TJournaledSet journalSet;
268 JoinConfig<GlobalAlign<JournaledCompact> > joinConfig;
269 loadAndJoin(journalSet, databaseFile, joinConfig);
270
271 // Define a pattern and start search.
272 StringSet<String<int> > hitSet;
273 TSequence pattern = "GTGGT";
274 std::cout << "Search for: " << pattern << ":\n";
275 searchPattern(hitSet, journalSet, pattern);
276 //![main]
277
278 //![printResult]
279 if (empty(hitSet[0]))
280 {
281 std::cout << "No hit in reference " << std::endl;
282 }
283 else
284 {
285 std::cout << "Hit in reference " << " at ";
286 for (unsigned j = 0; j < length(hitSet[0]); ++j)
287 std::cout << hitSet[0][j] << ": " << infix(host(journalSet), hitSet[0][j], hitSet[0][j] + length(pattern)) << "\t";
288 }
289 std::cout << std::endl;
290
291 for (unsigned i = 1; i < length(hitSet); ++i)
292 {
293 if (empty(hitSet[i]))
294 {
295 std::cout << "No hit in sequence " << i - 1 << std::endl;
296 }
297 else
298 {
299 std::cout << "Hit in sequence " << i - 1 << " at ";
300 for (unsigned j = 0; j < length(hitSet[i]); ++j)
301 std::cout << hitSet[i][j] << ": " << infix(value(journalSet, i - 1), hitSet[i][j], hitSet[i][j] + length(pattern)) << "\t";
302 }
303 std::cout << std::endl;
304 }
305
306 std::cout << "Done!" << std::endl;
307 return 0;
308 }
309 //![printResult]
310