1 //![include]
2 #include <iostream>
3 #include <seqan/seq_io.h>
4 #include <seqan/journaled_set.h>
5 
6 using namespace seqan;
7 //![include]
8 
9 //![searchAtBorder]
10 template <typename TJournalEntriesIterator, typename TJournal, typename TPattern>
_searchAtBorder(String<int> & hitTarget,TJournalEntriesIterator & entriesIt,TJournal const & journal,TPattern const & pattern)11 void _searchAtBorder(String<int> & hitTarget,
12                      TJournalEntriesIterator & entriesIt,
13                      TJournal const & journal,
14                      TPattern const & pattern)
15 {
16     typedef typename Iterator<TJournal const, Standard>::Type TJournalIterator;
17 
18     // Define region before the border to the next node to search for the pattern.
19     TJournalIterator nodeIter = iter(journal, entriesIt->virtualPosition +
20                                      _max(0, (int) entriesIt->length - (int) length(pattern) + 1));
21     // Define end of search region.
22     TJournalIterator nodeEnd = iter(journal, _min(entriesIt->virtualPosition + entriesIt->length, length(journal) -
23                                                   length(pattern) + 1));
24     // Move step by step over search region.
25     if (nodeEnd == end(journal))
26         return;
27 
28     for (; nodeIter != nodeEnd; ++nodeIter)
29     {
30         // Define compare iterator.
31         TJournalIterator verifyIter = nodeIter;
32         bool isHit = true;
33         // Compare pattern with current search window.
34         for (unsigned posPattern = 0; posPattern < length(pattern); ++posPattern, ++verifyIter)
35         {
36             // Comparing the pattern value with the current value of the iterator.
37             if (pattern[posPattern] != getValue(verifyIter))
38             {
39                 isHit = false;
40                 break;
41             }
42         }
43         // Report hit if found.
44         if (isHit)
45             appendValue(hitTarget, position(nodeIter));
46     }
47 }
48 //![searchAtBorder]
49 
50 //![findInPatchNodePart1]
51 template <typename TJournalEntriesIterator, typename TJournal, typename TPattern>
_findInPatchNode(String<int> & hitTarget,TJournalEntriesIterator & entriesIt,TJournal const & journal,TPattern const & pattern)52 void _findInPatchNode(String<int> & hitTarget,
53                       TJournalEntriesIterator & entriesIt,
54                       TJournal const & journal,
55                       TPattern const & pattern)
56 {
57     typedef typename Iterator<TJournal const, Standard>::Type TJournalIterator;
58 //![findInPatchNodePart1]
59 
60 //![findInPatchNodePart2]
61     // Search for pattern in the insertion node.
62     TJournalIterator patchIter = iter(journal, entriesIt->virtualPosition);
63     TJournalIterator patchEnd = patchIter + _max(0, (int)entriesIt->length - (int)length(pattern) + 1);
64     // Move step by step over search region.
65     for (; patchIter != patchEnd; ++patchIter)
66     {
67 //![findInPatchNodePart2]
68 //![findInPatchNodePart3]
69         TJournalIterator verifyIter = patchIter;
70         bool isHit = true;
71         // Search for pattern in the insertion node.
72         for (unsigned posPattern = 0; posPattern < length(pattern); ++posPattern, ++verifyIter)
73         {
74             // Comparing the pattern value with the current value of the iterator.
75             if (pattern[posPattern] != getValue(verifyIter))
76             {
77                 isHit = false;
78                 break;
79             }
80         }
81         if (isHit)
82             appendValue(hitTarget, position(patchIter));
83     }
84 }
85 //![findInPatchNodePart3]
86 
87 //![findInOriginalNode]
88 template <typename TJournalEntriesIterator, typename TPattern>
_findInOriginalNode(String<int> & hitTarget,TJournalEntriesIterator & entriesIt,TPattern const & pattern,String<int> const & refHits)89 void _findInOriginalNode(String<int> & hitTarget,
90                          TJournalEntriesIterator & entriesIt,
91                          TPattern const & pattern,
92                          String<int> const & refHits)
93 {
94     // Define an Iterator which iterates over the reference hit set.
95     typedef typename Iterator<String<int> const, Standard>::Type THitIterator;
96 
97     // Check if hits exist in the reference.
98     if (!empty(refHits))
99     {
100         // Find upper bound to physical position in sorted refHits.
101         THitIterator itHit = std::upper_bound(begin(refHits), end(refHits), (int)entriesIt->physicalPosition);
102         // Make sure we do not miss hits that begin at physical position of current node.
103         if (itHit != begin(refHits) && *(itHit - 1) >= (int)entriesIt->physicalPosition)
104             --itHit;
105         // Store all hits that are found in the region of the reference which is covered by this node.
106         while ((int)*itHit < ((int)entriesIt->physicalPosition + (int)entriesIt->length - (int)length(pattern) + 1) && itHit != end(refHits))
107         {
108             appendValue(hitTarget, entriesIt->virtualPosition + (*itHit - (int)entriesIt->physicalPosition));
109             ++itHit;
110         }
111     }
112 }
113 //![findInOriginalNode]
114 
115 //![findPatternInJournalStringPart1]
116 template <typename TValue, typename THostSpec, typename TJournalSpec, typename TBufferSpec, typename TPattern>
findPatternInJournalString(String<int> & hitTarget,String<TValue,Journaled<THostSpec,TJournalSpec,TBufferSpec>> const & journal,TPattern const & pattern,String<int> const & refHits)117 void findPatternInJournalString(String<int> & hitTarget,
118                                 String<TValue, Journaled<THostSpec, TJournalSpec, TBufferSpec> > const & journal,
119                                 TPattern const & pattern,
120                                 String<int> const & refHits)
121 {
122     typedef String<TValue, Journaled<THostSpec, TJournalSpec, TBufferSpec> > const TJournal;
123     typedef typename JournalType<TJournal>::Type TJournalEntries;
124     typedef typename Iterator<TJournalEntries>::Type TJournalEntriesIterator;
125 
126     if (length(pattern) > length(journal))
127         return;
128 //![findPatternInJournalStringPart1]
129 
130 //![findPatternInJournalStringPart2]
131     TJournalEntriesIterator it = begin(journal._journalEntries);
132     TJournalEntriesIterator itEnd = findInJournalEntries(journal._journalEntries, length(journal) - length(pattern) + 1) + 1;
133 //![findPatternInJournalStringPart2]
134 
135 //![findPatternInJournalStringPart3]
136     while (it != itEnd)
137     {
138         if (it->segmentSource == SOURCE_ORIGINAL) // Find a possible hit in the current source vertex.
139         {
140             _findInOriginalNode(hitTarget, it, pattern, refHits);
141         }
142         if (it->segmentSource == SOURCE_PATCH) // Search for pattern within the patch node.
143         {
144             _findInPatchNode(hitTarget, it, journal, pattern);
145         }
146         // Scan the border for a possible match.
147         _searchAtBorder(hitTarget, it, journal, pattern);
148         ++it;
149     }
150 }
151 //![findPatternInJournalStringPart3]
152 
153 //![findPatternInReference]
154 template <typename TString, typename TPattern>
findPatternInReference(String<int> & hits,TString const & reference,TPattern const & pattern)155 void findPatternInReference(String<int> & hits,
156                             TString const & reference,
157                             TPattern const & pattern)
158 {
159     // Check whether the pattern fits into the sequence.
160     if (length(pattern) > length(reference))
161         return;
162 
163     //
164     for (unsigned pos = 0; pos < length(reference) - length(pattern) + 1; ++pos)
165     {
166         bool isHit = true;
167 
168         for (unsigned posPattern = 0; posPattern < length(pattern); ++posPattern)
169         {
170             if (pattern[posPattern] != reference[posPattern + pos])
171             {
172                 isHit = false;
173                 break;
174             }
175         }
176         // Report the position if found a hit.
177         if (isHit)
178             appendValue(hits, pos);
179     }
180 }
181 //![findPatternInReference]
182 
183 //![searchPatternPart1]
184 template <typename TString, typename TPattern>
searchPattern(StringSet<String<int>> & hitSet,StringSet<TString,Owner<JournaledSet>> const & journalSet,TPattern const & pattern)185 void searchPattern(StringSet<String<int> > & hitSet,
186                    StringSet<TString, Owner<JournaledSet> > const & journalSet,
187                    TPattern const & pattern)
188 {
189     typedef StringSet<TString, Owner<JournaledSet> > TJournalSet;
190     typedef typename Host<TJournalSet const>::Type THost;
191 
192     // Check for valid initial state.
193     if (empty(host(journalSet)))
194     {
195         std::cout << "No reference set. Aborted search!" << std::endl;
196         return;
197     }
198 
199     // Reset the hitSet to avoid phantom hits coming from a previous search.
200     clear(hitSet);
201     resize(hitSet, length(journalSet) + 1);
202 //![searchPatternPart1]
203 //![searchPatternPart2]
204     // Access the reference sequence.
205     THost & globalRef = host(journalSet);
206     // Search for pattern in the reference sequence.
207     findPatternInReference(hitSet[0], globalRef, pattern);
208 //![searchPatternPart2]
209 
210 //![searchPatternPart3]
211     // Search for pattern in the journaled sequences.
212     for (unsigned i = 0; i < length(journalSet); ++i)
213         findPatternInJournalString(hitSet[i + 1], journalSet[i], pattern, hitSet[0]);
214 }
215 //![searchPatternPart3]
216 
217 //![laodAndJoin]
218 template <typename TString, typename TSpec>
219 inline int
loadAndJoin(StringSet<TString,Owner<JournaledSet>> & journalSet,SeqFileIn & databaseFile,JoinConfig<TSpec> const & joinConfig)220 loadAndJoin(StringSet<TString, Owner<JournaledSet> > & journalSet,
221             SeqFileIn & databaseFile,
222             JoinConfig<TSpec> const & joinConfig)
223 {
224     typedef typename Host<TString>::Type THost;
225 
226     clear(journalSet);
227 
228     String<char> seqId;
229     THost sequence;
230 
231     // No sequences in the fasta file!
232     if (atEnd(databaseFile))
233     {
234         std::cerr << "Empty FASTA file." << std::endl;
235         return -1;
236     }
237     // First read sequence for reference sequence.
238     readRecord(seqId, sequence, databaseFile);
239 
240     // We have to create the global reference sequence otherwise we loose the information after this function terminates.
241     createHost(journalSet, sequence);
242 
243     // If there are more
244     while (!atEnd(databaseFile))
245     {
246         readRecord(seqId, sequence, databaseFile);
247         appendValue(journalSet, TString(sequence));
248         join(journalSet, length(journalSet) - 1, joinConfig);
249     }
250     return 0;
251 }
252 //![laodAndJoin]
253 
254 //![main]
main()255 int main()
256 {
257     // Definition of the used types.
258     typedef String<Dna, Alloc<> > TSequence;
259     typedef String<Dna, Journaled<Alloc<>, SortedArray, Alloc<> > > TJournal;
260     typedef StringSet<TJournal, Owner<JournaledSet> > TJournaledSet;
261 
262     // Open the stream to the file containing the sequences.
263     CharString seqDatabasePath = getAbsolutePath("demos/tutorial/journaled_set/sequences.fasta");
264     SeqFileIn databaseFile(toCString(seqDatabasePath));
265 
266     // Reading each sequence and journal them.
267     TJournaledSet journalSet;
268     JoinConfig<GlobalAlign<JournaledCompact> > joinConfig;
269     loadAndJoin(journalSet, databaseFile, joinConfig);
270 
271     // Define a pattern and start search.
272     StringSet<String<int> > hitSet;
273     TSequence pattern = "GTGGT";
274     std::cout << "Search for: " << pattern << ":\n";
275     searchPattern(hitSet, journalSet, pattern);
276 //![main]
277 
278 //![printResult]
279     if (empty(hitSet[0]))
280     {
281         std::cout << "No hit in reference " << std::endl;
282     }
283     else
284     {
285         std::cout << "Hit in reference " << " at ";
286         for (unsigned j = 0; j < length(hitSet[0]); ++j)
287             std::cout << hitSet[0][j] << ": " << infix(host(journalSet), hitSet[0][j], hitSet[0][j] + length(pattern)) << "\t";
288     }
289     std::cout << std::endl;
290 
291     for (unsigned i = 1; i < length(hitSet); ++i)
292     {
293         if (empty(hitSet[i]))
294         {
295             std::cout << "No hit in sequence " << i - 1 << std::endl;
296         }
297         else
298         {
299             std::cout << "Hit in sequence " << i - 1 << " at ";
300             for (unsigned j = 0; j < length(hitSet[i]); ++j)
301                 std::cout << hitSet[i][j] << ": " << infix(value(journalSet, i - 1), hitSet[i][j], hitSet[i][j] + length(pattern)) << "\t";
302         }
303         std::cout << std::endl;
304     }
305 
306     std::cout << "Done!" << std::endl;
307     return 0;
308 }
309 //![printResult]
310