1 // Jericho HTML Parser - Java based library for analysing and manipulating HTML
2 // Version 3.2
3 // Copyright (C) 2004-2009 Martin Jericho
4 // http://jericho.htmlparser.net/
5 //
6 // This library is free software; you can redistribute it and/or
7 // modify it under the terms of either one of the following licences:
8 //
9 // 1. The Eclipse Public License (EPL) version 1.0,
10 // included in this distribution in the file licence-epl-1.0.html
11 // or available at http://www.eclipse.org/legal/epl-v10.html
12 //
13 // 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
14 // included in this distribution in the file licence-lgpl-2.1.txt
15 // or available at http://www.gnu.org/licenses/lgpl.txt
16 //
17 // This library is distributed on an "AS IS" basis,
18 // WITHOUT WARRANTY OF ANY KIND, either express or implied.
19 // See the individual licence texts for more details.
20 
21 package net.htmlparser.jericho;
22 
23 import java.util.*;
24 
25 /**
26  * Represents a cached map of character positions to tags for a particular tag type,
27  * or for all tag types if the tagType field is null.
28  */
29 final class SubCache {
30 	private final Cache cache;
31 	public final TagType tagType; // does not support unregistered tag types at present
32 	private final CacheEntry bof; // beginning of file marker
33 	private final CacheEntry eof; // end of file marker
34 	private CacheEntry[] array=new CacheEntry[INITIAL_CAPACITY];
35 
36 	private static final int INITIAL_CAPACITY=64;
37 
SubCache(final Cache cache, final TagType tagType)38 	public SubCache(final Cache cache, final TagType tagType) {
39 		this.cache=cache;
40 		this.tagType=tagType;
41 		array[0]=bof=new CacheEntry(0,-1,null,false,false);
42 		array[1]=eof=new CacheEntry(1,cache.getSourceLength(),null,false,false);
43 	}
44 
size()45 	public int size() {
46 		return eof.index+1;
47 	}
48 
clear()49 	public void clear() {
50 		bof.nextCached=false;
51 		eof.index=1;
52 		eof.previousCached=false;
53 		array[1]=eof;
54 	}
55 
bulkLoad_Init(final int tagCount)56 	public void bulkLoad_Init(final int tagCount) {
57 		array=new CacheEntry[tagCount+2];
58 		array[0]=bof;
59 		bof.nextCached=true;
60 		array[eof.index=tagCount+1]=eof;
61 		eof.previousCached=true;
62 	}
63 
bulkLoad_Set(final int tagsIndex, final Tag tag)64 	public void bulkLoad_Set(final int tagsIndex, final Tag tag) {
65 		final int index=tagsIndex+1;
66 		array[index]=new CacheEntry(index,tag.begin,tag,true,true);
67 	}
68 
bulkLoad_AddToTypeSpecificCache(final Tag tag)69 	public void bulkLoad_AddToTypeSpecificCache(final Tag tag) {
70 		final int index=eof.index;
71 		if (array.length==eof.index+1) doubleCapacity();
72 		array[index]=new CacheEntry(index,tag.begin,tag,true,true);
73 		eof.index++;
74 	}
75 
bulkLoad_FinaliseTypeSpecificCache()76 	public void bulkLoad_FinaliseTypeSpecificCache() {
77 		bof.nextCached=true;
78 		eof.previousCached=true;
79 		array[eof.index]=eof;
80 	}
81 
getTagAt(final int pos, final boolean serverTagOnly)82 	public Tag getTagAt(final int pos, final boolean serverTagOnly) {
83 		// This must only be called on allTagTypesSubCache (ie tagType==null)
84 		if (cache.getSourceLength()==0) return null;
85 		if (pos<0 || pos>=cache.getSourceLength()) return null;
86 		final int index=getIndexOfPos(pos);
87 		final CacheEntry cacheEntry=array[index];
88 		if (cacheEntry.pos==pos) {
89 			if (serverTagOnly && !cacheEntry.tag.getTagType().isServerTag()) return null;
90 			return cacheEntry.tag;
91 		}
92 		if (cacheEntry.previousCached) return null;
93 		return cache.addTagAt(pos,serverTagOnly);
94 	}
95 
addTagAt(final int pos, final Tag tag)96 	public void addTagAt(final int pos, final Tag tag) {
97 		final int index=getIndexOfPos(pos);
98 		final CacheEntry nextCacheEntry=array[index];
99 		final CacheEntry previousCacheEntry=getPrevious(nextCacheEntry);
100 		add(previousCacheEntry,new CacheEntry(index,pos,tag,pos==previousCacheEntry.pos+1,pos==nextCacheEntry.pos-1),nextCacheEntry);
101 	}
102 
getPreviousTag(final int pos)103 	public Tag getPreviousTag(final int pos) {
104 		// Note that this method never returns tags for which tag.includInSearch() is false, so separate caching of unregistered tags won't work.
105 		if (cache.getSourceLength()==0) return null;
106 		if (pos<0 || pos>=cache.getSourceLength()) return null;
107 		int index=getIndexOfPos(pos);
108 		final CacheEntry cacheEntry=array[index];
109 		final Tag tag;
110 		if (cacheEntry.pos==pos && cacheEntry.tag!=null && cacheEntry.tag.includeInSearch()) return cacheEntry.tag;
111 		tag=getPreviousTag(getPrevious(cacheEntry),pos,cacheEntry);
112 		addPreviousTag(pos,tag);
113 		return tag;
114 	}
115 
getNextTag(final int pos)116 	public Tag getNextTag(final int pos) {
117 		// Note that this method never returns tags for which tag.includInSearch() is false, so separate caching of unregistered tags won't work.
118 		if (cache.getSourceLength()==0) return null;
119 		if (pos<0 || pos>=cache.getSourceLength()) return null;
120 		int index=getIndexOfPos(pos);
121 		final CacheEntry cacheEntry=array[index];
122 		final Tag tag;
123 		if (cacheEntry.pos==pos) {
124 			if (cacheEntry.tag!=null && cacheEntry.tag.includeInSearch()) return cacheEntry.tag;
125 			tag=getNextTag(cacheEntry,pos,getNext(cacheEntry));
126 		} else {
127 			tag=getNextTag(getPrevious(cacheEntry),pos,cacheEntry);
128 		}
129 		addNextTag(pos,tag);
130 		return tag;
131 	}
132 
getTagIterator()133 	public Iterator<Tag> getTagIterator() {
134 		return new TagIterator();
135 	}
136 
toString()137 	public String toString() {
138 		return appendTo(new StringBuilder()).toString();
139 	}
140 
appendTo(final StringBuilder sb)141 	protected StringBuilder appendTo(final StringBuilder sb) {
142 		sb.append("Cache for TagType : ").append(tagType).append(Config.NewLine);
143 		for (int i=0; i<=lastIndex(); i++) sb.append(array[i]).append(Config.NewLine);
144 		return sb;
145 	}
146 
getPreviousTag(CacheEntry previousCacheEntry, int pos, CacheEntry nextCacheEntry)147 	private Tag getPreviousTag(CacheEntry previousCacheEntry, int pos, CacheEntry nextCacheEntry) {
148 		// previousCacheEntry.pos < pos <= nextCacheEntry.pos
149 		while (true) {
150 			if (!nextCacheEntry.previousCached) {
151 				final Tag tag=Tag.getPreviousTagUncached(cache.source,pos,tagType,previousCacheEntry.pos); // if useAllTypesCache is true, automatically adds tag to all caches if one is found, and maybe some unregistered tags along the way.
152 				if (tag!=null) {
153 					if (!cache.source.useAllTypesCache) addTagAt(tag.begin,tag); // have to add tag manually if useAllTypesCache is false
154 					return tag;
155 				}
156 			}
157 			if (previousCacheEntry==bof) return null;
158 			if (previousCacheEntry.tag!=null && previousCacheEntry.tag.includeInSearch()) return previousCacheEntry.tag;
159 			pos=previousCacheEntry.pos-1;
160 			previousCacheEntry=getPrevious(nextCacheEntry=previousCacheEntry);
161 		}
162 	}
163 
getNextTag(CacheEntry previousCacheEntry, int pos, CacheEntry nextCacheEntry)164 	private Tag getNextTag(CacheEntry previousCacheEntry, int pos, CacheEntry nextCacheEntry) {
165 		// previousCacheEntry.pos <= pos < nextCacheEntry.pos
166 		while (true) {
167 			if (!previousCacheEntry.nextCached) {
168 				final Tag tag=Tag.getNextTagUncached(cache.source,pos,tagType,nextCacheEntry.pos); // if useAllTypesCache is true, automatically adds tag to caches if one is found, and maybe some unregistered tags along the way.
169 				if (tag!=null) {
170 					if (!cache.source.useAllTypesCache) addTagAt(tag.begin,tag); // have to add tag manually if useAllTypesCache is false
171 					return tag;
172 				}
173 			}
174 			if (nextCacheEntry==eof) return null;
175 			if (nextCacheEntry.tag!=null && nextCacheEntry.tag.includeInSearch()) return nextCacheEntry.tag;
176 			pos=nextCacheEntry.pos+1;
177 			nextCacheEntry=getNext(previousCacheEntry=nextCacheEntry);
178 		}
179 	}
180 
addPreviousTag(final int pos, final Tag tag)181 	private void addPreviousTag(final int pos, final Tag tag) {
182 		final int tagPos=(tag==null) ? bof.pos : tag.begin;
183 		if (tagPos==pos) return; // the tag was found exactly on pos, so cache has already been fully updated
184 		// tagPos < pos
185 		int index=getIndexOfPos(pos);
186 		CacheEntry stepCacheEntry=array[index];
187 		// stepCacheEntry.pos is either == or > than tagPos.
188 		// stepCacheEntry.pos is either == or > pos.
189 		int compactStartIndex=Integer.MAX_VALUE;
190 		if (stepCacheEntry.pos==pos) {
191 			// a cache entry was aleady at pos (containing null or wrong tagType)
192 			stepCacheEntry.previousCached=true;
193 			if (stepCacheEntry.isRedundant()) {stepCacheEntry.removed=true; compactStartIndex=Math.min(compactStartIndex,stepCacheEntry.index);}
194 		} else if (!stepCacheEntry.previousCached) {
195 			// we have to add a new cacheEntry at pos:
196 			if (tagType==null)
197 				cache.addTagAt(pos,false); // this pos has never been checked before, so add it to all relevant SubCaches (a null or unregistered tag entry is always added to this SubCache)
198 			else
199 				addTagAt(pos,null); // all we know is that the pos doesn't contain a tag of this SubCache's type, so add a null entry to this SubCache only.
200 			// now we have to reload the index and stepCacheEntry as they may have changed:
201 			stepCacheEntry=array[index=getIndexOfPos(pos)];
202 			// stepCacheEntry.pos is either == or > than tagPos.
203 			// stepCacheEntry.pos is either == or > pos. (the latter if the added entry was redundant)
204 			if (stepCacheEntry.pos==pos) {
205 				// perform same steps as in the (stepCacheEntry.pos==pos) if condition above:
206 				stepCacheEntry.previousCached=true;
207 				if (stepCacheEntry.isRedundant()) {stepCacheEntry.removed=true; compactStartIndex=Math.min(compactStartIndex,stepCacheEntry.index);}
208 			}
209 		}
210 		while (true) {
211 			stepCacheEntry=array[--index];
212 			if (stepCacheEntry.pos<=tagPos) break;
213 			if (stepCacheEntry.tag!=null) {
214 				if (stepCacheEntry.tag.includeInSearch()) throw new SourceCacheEntryMissingInternalError(tagType,tag,this);
215 				stepCacheEntry.previousCached=true;
216 				stepCacheEntry.nextCached=true;
217 			} else {
218 				stepCacheEntry.removed=true; compactStartIndex=Math.min(compactStartIndex,stepCacheEntry.index);
219 			}
220 		}
221 		if (stepCacheEntry.pos!=tagPos) throw new FoundCacheEntryMissingInternalError(tagType,tag,this);
222 		stepCacheEntry.nextCached=true;
223 		compact(compactStartIndex);
224 	}
225 
addNextTag(final int pos, final Tag tag)226 	private void addNextTag(final int pos, final Tag tag) {
227 		final int tagPos=(tag==null) ? eof.pos : tag.begin;
228 		if (tagPos==pos) return; // the tag was found exactly on pos, so cache has already been fully updated
229 		// tagPos > pos
230 		int index=getIndexOfPos(pos);
231 		CacheEntry stepCacheEntry=array[index];
232 		// stepCacheEntry.pos may be <, == or > than tagPos.
233 		// stepCacheEntry.pos is either == or > pos.
234 		int compactStartIndex=Integer.MAX_VALUE;
235 		if (stepCacheEntry.pos==pos) {
236 			// a cache entry was aleady at pos (containing null or wrong tagType)
237 			stepCacheEntry.nextCached=true;
238 			if (stepCacheEntry.isRedundant()) {stepCacheEntry.removed=true; compactStartIndex=Math.min(compactStartIndex,stepCacheEntry.index);}
239 		} else if (!getPrevious(stepCacheEntry).nextCached) {
240 			// we have to add a new cacheEntry at pos:
241 			if (tagType==null)
242 				cache.addTagAt(pos,false); // this pos has never been checked before, so add it to all relevant SubCaches (a null or unregistered tag entry is always added to this SubCache)
243 			else
244 				addTagAt(pos,null); // all we know is that the pos doesn't contain a tag of this SubCache's type, so add a null entry to this SubCache only.
245 			// now we have to reload the index and stepCacheEntry as they may have changed:
246 			stepCacheEntry=array[index=getIndexOfPos(pos)];
247 			// stepCacheEntry.pos may be <, == or > than tagPos.
248 			// stepCacheEntry.pos is either == or > pos. (the latter if the added entry was redundant)
249 			if (stepCacheEntry.pos==pos) {
250 				// perform same steps as in the (stepCacheEntry.pos==pos) if condition above:
251 				stepCacheEntry.nextCached=true;
252 				if (stepCacheEntry.isRedundant()) {stepCacheEntry.removed=true; compactStartIndex=Math.min(compactStartIndex,stepCacheEntry.index);}
253 			}
254 		}
255 		if (stepCacheEntry.pos<tagPos) {
256 			while (true) {
257 				stepCacheEntry=array[++index];
258 				if (stepCacheEntry.pos>=tagPos) break;
259 				if (stepCacheEntry.tag!=null) {
260 					if (stepCacheEntry.tag.includeInSearch()) throw new SourceCacheEntryMissingInternalError(tagType,tag,this);
261 					stepCacheEntry.previousCached=true;
262 					stepCacheEntry.nextCached=true;
263 				} else {
264 					stepCacheEntry.removed=true; compactStartIndex=Math.min(compactStartIndex,stepCacheEntry.index);
265 				}
266 			}
267 			if (stepCacheEntry.pos!=tagPos) throw new FoundCacheEntryMissingInternalError(tagType,tag,this);
268 		}
269 		stepCacheEntry.previousCached=true;
270 		compact(compactStartIndex);
271 	}
272 
compact(int i)273 	private void compact(int i) {
274 		final int lastIndex=lastIndex();
275 		int removedCount=1;
276 		while (i<lastIndex) {
277 			final CacheEntry cacheEntry=array[++i];
278 			if (cacheEntry.removed)
279 				removedCount++;
280 			else
281 				array[cacheEntry.index=i-removedCount]=cacheEntry;
282 		}
283 	}
284 
add(final CacheEntry previousCacheEntry, final CacheEntry newCacheEntry, final CacheEntry nextCacheEntry)285 	private void add(final CacheEntry previousCacheEntry, final CacheEntry newCacheEntry, final CacheEntry nextCacheEntry) {
286 		if (!newCacheEntry.isRedundant()) insert(newCacheEntry);
287 		if (newCacheEntry.previousCached) {
288 			previousCacheEntry.nextCached=true;
289 			if (previousCacheEntry.isRedundant()) remove(previousCacheEntry);
290 		}
291 		if (newCacheEntry.nextCached) {
292 			nextCacheEntry.previousCached=true;
293 			if (nextCacheEntry.isRedundant()) remove(nextCacheEntry);
294 		}
295 	}
296 
getIndexOfPos(final int pos)297 	private int getIndexOfPos(final int pos) {
298 		// return the index of the cacheEntry at pos, or the index where it would be inserted if it does not exist.
299 		int minIndex=0;
300 		int maxIndex=lastIndex();
301 		// using the following complex calculation instead of a binary search is likely to result in less iterations but is slower overall:
302 		// int index=(pos*maxIndex)/cache.getSourceLength(); // approximate first guess at index, assuming even distribution of cache entries
303 		int index=maxIndex>>1;
304 		while (true) {
305 			final CacheEntry cacheEntry=array[index];
306 			if (pos>cacheEntry.pos) {
307 				final CacheEntry nextCacheEntry=getNext(cacheEntry);
308 				if (pos<=nextCacheEntry.pos) return nextCacheEntry.index;
309 				minIndex=nextCacheEntry.index;
310 			} else if (pos<cacheEntry.pos) {
311 				final CacheEntry previousCacheEntry=getPrevious(cacheEntry);
312 				if (pos==previousCacheEntry.pos) return previousCacheEntry.index;
313 				if (pos>previousCacheEntry.pos) return index;
314 				maxIndex=previousCacheEntry.index;
315 			} else {
316 				return index;
317 			}
318 			index=(minIndex+maxIndex)>>1;
319 			// using the following complex calculation instead of a binary search is likely to result in less iterations but is slower overall:
320 			// final int minIndexPos=array[minIndex].pos;
321 			// index=((maxIndex-minIndex-1)*(pos-minIndexPos))/(array[maxIndex].pos-minIndexPos)+minIndex+1; // approximate next guess at index, assuming even distribution of cache entries between min and max entries
322 		}
323 	}
324 
getNext(final CacheEntry cacheEntry)325 	private CacheEntry getNext(final CacheEntry cacheEntry) {
326 		return array[cacheEntry.index+1];
327 	}
328 
getPrevious(final CacheEntry cacheEntry)329 	private CacheEntry getPrevious(final CacheEntry cacheEntry) {
330 		return array[cacheEntry.index-1];
331 	}
332 
lastIndex()333 	private int lastIndex() {
334 		return eof.index;
335 	}
336 
insert(final CacheEntry cacheEntry)337 	private void insert(final CacheEntry cacheEntry) {
338 		final int index=cacheEntry.index;
339 		if (array.length==size()) doubleCapacity();
340 		for (int i=lastIndex(); i>=index; i--) {
341 			final CacheEntry movedCacheEntry=array[i];
342 			array[movedCacheEntry.index=(i+1)]=movedCacheEntry;
343 		}
344 		array[index]=cacheEntry;
345 	}
346 
remove(final CacheEntry cacheEntry)347 	private void remove(final CacheEntry cacheEntry) {
348 		final int lastIndex=lastIndex();
349 		for (int i=cacheEntry.index; i<lastIndex; i++) {
350 			final CacheEntry movedCacheEntry=array[i+1];
351 			array[movedCacheEntry.index=i]=movedCacheEntry;
352 		}
353 	}
354 
doubleCapacity()355 	private void doubleCapacity() {
356 		// assumes size==array.length
357 		final CacheEntry[] newArray=new CacheEntry[array.length << 1];
358 		for (int i=lastIndex(); i>=0; i--) newArray[i]=array[i];
359 		array=newArray;
360 	}
361 
362 	private static class CacheEntryMissingInternalError extends AssertionError {
CacheEntryMissingInternalError(final TagType tagType, final Tag tag, final SubCache subCache, final String message)363 		public CacheEntryMissingInternalError(final TagType tagType, final Tag tag, final SubCache subCache, final String message) {
364 			super("INTERNAL ERROR: Inconsistent Cache State for TagType \""+tagType+"\" - "+message+' '+tag.getDebugInfo()+'\n'+subCache);
365 		}
366 	}
367 
368 	private static class SourceCacheEntryMissingInternalError extends CacheEntryMissingInternalError {
SourceCacheEntryMissingInternalError(final TagType tagType, final Tag tag, final SubCache subCache)369 		public SourceCacheEntryMissingInternalError(final TagType tagType, final Tag tag, final SubCache subCache) {
370 			super(tagType,tag,subCache,"cache entry no longer found in source:");
371 		}
372 	}
373 
374 	private static class FoundCacheEntryMissingInternalError extends CacheEntryMissingInternalError {
FoundCacheEntryMissingInternalError(final TagType tagType, final Tag tag, final SubCache subCache)375 		public FoundCacheEntryMissingInternalError(final TagType tagType, final Tag tag, final SubCache subCache) {
376 			super(tagType,tag,subCache,"missing cache entry for found tag");
377 		}
378 	}
379 
380 	private final class TagIterator implements Iterator<Tag> {
381 		private int i=0;
382 		private Tag nextTag;
TagIterator()383 		public TagIterator() {
384 			loadNextTag();
385 		}
hasNext()386 		public boolean hasNext() {
387 			return nextTag!=null;
388 		}
next()389 		public Tag next() {
390 			final Tag result=nextTag;
391 			loadNextTag();
392 			return result;
393 		}
remove()394 		public void remove() {
395 			throw new UnsupportedOperationException();
396 		}
loadNextTag()397 		private void loadNextTag() {
398 			while (++i<=lastIndex() && (nextTag=array[i].tag)==null) {}
399 		}
400 	}
401 
402 	private static final class CacheEntry {
403 		public int index;
404 		public final int pos;
405 		public final Tag tag;
406 		public boolean previousCached;
407 		public boolean nextCached;
408 		public boolean removed=false;
409 
CacheEntry(final int index, final int pos, final Tag tag, final boolean previousCached, final boolean nextCached)410 		public CacheEntry(final int index, final int pos, final Tag tag, final boolean previousCached, final boolean nextCached) {
411 			this.index=index;
412 			this.pos=pos;
413 			this.tag=tag;
414 			this.previousCached=previousCached;
415 			this.nextCached=nextCached;
416 		}
417 
isRedundant()418 		public boolean isRedundant() {
419 			return tag==null && previousCached && nextCached;
420 		}
421 
toString()422 		public String toString() {
423 			return pad(index,4)+" "+pad(pos,5)+" "+(previousCached?'|':'-')+' '+(nextCached?'|':'-')+' '+(tag==null ? "null" : tag.getDebugInfo());
424 		}
425 
pad(final int n, final int places)426 		private String pad(final int n, final int places) {
427 			final String nstring=String.valueOf(n);
428 			final StringBuilder sb=new StringBuilder(places);
429 			for (int i=places-nstring.length(); i>0; i--) sb.append(' ');
430 			sb.append(nstring);
431 			return sb.toString();
432 		}
433 	}
434 }
435