1 // Jericho HTML Parser - Java based library for analysing and manipulating HTML 2 // Version 3.2 3 // Copyright (C) 2004-2009 Martin Jericho 4 // http://jericho.htmlparser.net/ 5 // 6 // This library is free software; you can redistribute it and/or 7 // modify it under the terms of either one of the following licences: 8 // 9 // 1. The Eclipse Public License (EPL) version 1.0, 10 // included in this distribution in the file licence-epl-1.0.html 11 // or available at http://www.eclipse.org/legal/epl-v10.html 12 // 13 // 2. The GNU Lesser General Public License (LGPL) version 2.1 or later, 14 // included in this distribution in the file licence-lgpl-2.1.txt 15 // or available at http://www.gnu.org/licenses/lgpl.txt 16 // 17 // This library is distributed on an "AS IS" basis, 18 // WITHOUT WARRANTY OF ANY KIND, either express or implied. 19 // See the individual licence texts for more details. 20 21 package net.htmlparser.jericho; 22 23 import java.util.*; 24 25 /** 26 * Represents a cached map of character positions to tags for a particular tag type, 27 * or for all tag types if the tagType field is null. 28 */ 29 final class SubCache { 30 private final Cache cache; 31 public final TagType tagType; // does not support unregistered tag types at present 32 private final CacheEntry bof; // beginning of file marker 33 private final CacheEntry eof; // end of file marker 34 private CacheEntry[] array=new CacheEntry[INITIAL_CAPACITY]; 35 36 private static final int INITIAL_CAPACITY=64; 37 SubCache(final Cache cache, final TagType tagType)38 public SubCache(final Cache cache, final TagType tagType) { 39 this.cache=cache; 40 this.tagType=tagType; 41 array[0]=bof=new CacheEntry(0,-1,null,false,false); 42 array[1]=eof=new CacheEntry(1,cache.getSourceLength(),null,false,false); 43 } 44 size()45 public int size() { 46 return eof.index+1; 47 } 48 clear()49 public void clear() { 50 bof.nextCached=false; 51 eof.index=1; 52 eof.previousCached=false; 53 array[1]=eof; 54 } 55 bulkLoad_Init(final int tagCount)56 public void bulkLoad_Init(final int tagCount) { 57 array=new CacheEntry[tagCount+2]; 58 array[0]=bof; 59 bof.nextCached=true; 60 array[eof.index=tagCount+1]=eof; 61 eof.previousCached=true; 62 } 63 bulkLoad_Set(final int tagsIndex, final Tag tag)64 public void bulkLoad_Set(final int tagsIndex, final Tag tag) { 65 final int index=tagsIndex+1; 66 array[index]=new CacheEntry(index,tag.begin,tag,true,true); 67 } 68 bulkLoad_AddToTypeSpecificCache(final Tag tag)69 public void bulkLoad_AddToTypeSpecificCache(final Tag tag) { 70 final int index=eof.index; 71 if (array.length==eof.index+1) doubleCapacity(); 72 array[index]=new CacheEntry(index,tag.begin,tag,true,true); 73 eof.index++; 74 } 75 bulkLoad_FinaliseTypeSpecificCache()76 public void bulkLoad_FinaliseTypeSpecificCache() { 77 bof.nextCached=true; 78 eof.previousCached=true; 79 array[eof.index]=eof; 80 } 81 getTagAt(final int pos, final boolean serverTagOnly)82 public Tag getTagAt(final int pos, final boolean serverTagOnly) { 83 // This must only be called on allTagTypesSubCache (ie tagType==null) 84 if (cache.getSourceLength()==0) return null; 85 if (pos<0 || pos>=cache.getSourceLength()) return null; 86 final int index=getIndexOfPos(pos); 87 final CacheEntry cacheEntry=array[index]; 88 if (cacheEntry.pos==pos) { 89 if (serverTagOnly && !cacheEntry.tag.getTagType().isServerTag()) return null; 90 return cacheEntry.tag; 91 } 92 if (cacheEntry.previousCached) return null; 93 return cache.addTagAt(pos,serverTagOnly); 94 } 95 addTagAt(final int pos, final Tag tag)96 public void addTagAt(final int pos, final Tag tag) { 97 final int index=getIndexOfPos(pos); 98 final CacheEntry nextCacheEntry=array[index]; 99 final CacheEntry previousCacheEntry=getPrevious(nextCacheEntry); 100 add(previousCacheEntry,new CacheEntry(index,pos,tag,pos==previousCacheEntry.pos+1,pos==nextCacheEntry.pos-1),nextCacheEntry); 101 } 102 getPreviousTag(final int pos)103 public Tag getPreviousTag(final int pos) { 104 // Note that this method never returns tags for which tag.includInSearch() is false, so separate caching of unregistered tags won't work. 105 if (cache.getSourceLength()==0) return null; 106 if (pos<0 || pos>=cache.getSourceLength()) return null; 107 int index=getIndexOfPos(pos); 108 final CacheEntry cacheEntry=array[index]; 109 final Tag tag; 110 if (cacheEntry.pos==pos && cacheEntry.tag!=null && cacheEntry.tag.includeInSearch()) return cacheEntry.tag; 111 tag=getPreviousTag(getPrevious(cacheEntry),pos,cacheEntry); 112 addPreviousTag(pos,tag); 113 return tag; 114 } 115 getNextTag(final int pos)116 public Tag getNextTag(final int pos) { 117 // Note that this method never returns tags for which tag.includInSearch() is false, so separate caching of unregistered tags won't work. 118 if (cache.getSourceLength()==0) return null; 119 if (pos<0 || pos>=cache.getSourceLength()) return null; 120 int index=getIndexOfPos(pos); 121 final CacheEntry cacheEntry=array[index]; 122 final Tag tag; 123 if (cacheEntry.pos==pos) { 124 if (cacheEntry.tag!=null && cacheEntry.tag.includeInSearch()) return cacheEntry.tag; 125 tag=getNextTag(cacheEntry,pos,getNext(cacheEntry)); 126 } else { 127 tag=getNextTag(getPrevious(cacheEntry),pos,cacheEntry); 128 } 129 addNextTag(pos,tag); 130 return tag; 131 } 132 getTagIterator()133 public Iterator<Tag> getTagIterator() { 134 return new TagIterator(); 135 } 136 toString()137 public String toString() { 138 return appendTo(new StringBuilder()).toString(); 139 } 140 appendTo(final StringBuilder sb)141 protected StringBuilder appendTo(final StringBuilder sb) { 142 sb.append("Cache for TagType : ").append(tagType).append(Config.NewLine); 143 for (int i=0; i<=lastIndex(); i++) sb.append(array[i]).append(Config.NewLine); 144 return sb; 145 } 146 getPreviousTag(CacheEntry previousCacheEntry, int pos, CacheEntry nextCacheEntry)147 private Tag getPreviousTag(CacheEntry previousCacheEntry, int pos, CacheEntry nextCacheEntry) { 148 // previousCacheEntry.pos < pos <= nextCacheEntry.pos 149 while (true) { 150 if (!nextCacheEntry.previousCached) { 151 final Tag tag=Tag.getPreviousTagUncached(cache.source,pos,tagType,previousCacheEntry.pos); // if useAllTypesCache is true, automatically adds tag to all caches if one is found, and maybe some unregistered tags along the way. 152 if (tag!=null) { 153 if (!cache.source.useAllTypesCache) addTagAt(tag.begin,tag); // have to add tag manually if useAllTypesCache is false 154 return tag; 155 } 156 } 157 if (previousCacheEntry==bof) return null; 158 if (previousCacheEntry.tag!=null && previousCacheEntry.tag.includeInSearch()) return previousCacheEntry.tag; 159 pos=previousCacheEntry.pos-1; 160 previousCacheEntry=getPrevious(nextCacheEntry=previousCacheEntry); 161 } 162 } 163 getNextTag(CacheEntry previousCacheEntry, int pos, CacheEntry nextCacheEntry)164 private Tag getNextTag(CacheEntry previousCacheEntry, int pos, CacheEntry nextCacheEntry) { 165 // previousCacheEntry.pos <= pos < nextCacheEntry.pos 166 while (true) { 167 if (!previousCacheEntry.nextCached) { 168 final Tag tag=Tag.getNextTagUncached(cache.source,pos,tagType,nextCacheEntry.pos); // if useAllTypesCache is true, automatically adds tag to caches if one is found, and maybe some unregistered tags along the way. 169 if (tag!=null) { 170 if (!cache.source.useAllTypesCache) addTagAt(tag.begin,tag); // have to add tag manually if useAllTypesCache is false 171 return tag; 172 } 173 } 174 if (nextCacheEntry==eof) return null; 175 if (nextCacheEntry.tag!=null && nextCacheEntry.tag.includeInSearch()) return nextCacheEntry.tag; 176 pos=nextCacheEntry.pos+1; 177 nextCacheEntry=getNext(previousCacheEntry=nextCacheEntry); 178 } 179 } 180 addPreviousTag(final int pos, final Tag tag)181 private void addPreviousTag(final int pos, final Tag tag) { 182 final int tagPos=(tag==null) ? bof.pos : tag.begin; 183 if (tagPos==pos) return; // the tag was found exactly on pos, so cache has already been fully updated 184 // tagPos < pos 185 int index=getIndexOfPos(pos); 186 CacheEntry stepCacheEntry=array[index]; 187 // stepCacheEntry.pos is either == or > than tagPos. 188 // stepCacheEntry.pos is either == or > pos. 189 int compactStartIndex=Integer.MAX_VALUE; 190 if (stepCacheEntry.pos==pos) { 191 // a cache entry was aleady at pos (containing null or wrong tagType) 192 stepCacheEntry.previousCached=true; 193 if (stepCacheEntry.isRedundant()) {stepCacheEntry.removed=true; compactStartIndex=Math.min(compactStartIndex,stepCacheEntry.index);} 194 } else if (!stepCacheEntry.previousCached) { 195 // we have to add a new cacheEntry at pos: 196 if (tagType==null) 197 cache.addTagAt(pos,false); // this pos has never been checked before, so add it to all relevant SubCaches (a null or unregistered tag entry is always added to this SubCache) 198 else 199 addTagAt(pos,null); // all we know is that the pos doesn't contain a tag of this SubCache's type, so add a null entry to this SubCache only. 200 // now we have to reload the index and stepCacheEntry as they may have changed: 201 stepCacheEntry=array[index=getIndexOfPos(pos)]; 202 // stepCacheEntry.pos is either == or > than tagPos. 203 // stepCacheEntry.pos is either == or > pos. (the latter if the added entry was redundant) 204 if (stepCacheEntry.pos==pos) { 205 // perform same steps as in the (stepCacheEntry.pos==pos) if condition above: 206 stepCacheEntry.previousCached=true; 207 if (stepCacheEntry.isRedundant()) {stepCacheEntry.removed=true; compactStartIndex=Math.min(compactStartIndex,stepCacheEntry.index);} 208 } 209 } 210 while (true) { 211 stepCacheEntry=array[--index]; 212 if (stepCacheEntry.pos<=tagPos) break; 213 if (stepCacheEntry.tag!=null) { 214 if (stepCacheEntry.tag.includeInSearch()) throw new SourceCacheEntryMissingInternalError(tagType,tag,this); 215 stepCacheEntry.previousCached=true; 216 stepCacheEntry.nextCached=true; 217 } else { 218 stepCacheEntry.removed=true; compactStartIndex=Math.min(compactStartIndex,stepCacheEntry.index); 219 } 220 } 221 if (stepCacheEntry.pos!=tagPos) throw new FoundCacheEntryMissingInternalError(tagType,tag,this); 222 stepCacheEntry.nextCached=true; 223 compact(compactStartIndex); 224 } 225 addNextTag(final int pos, final Tag tag)226 private void addNextTag(final int pos, final Tag tag) { 227 final int tagPos=(tag==null) ? eof.pos : tag.begin; 228 if (tagPos==pos) return; // the tag was found exactly on pos, so cache has already been fully updated 229 // tagPos > pos 230 int index=getIndexOfPos(pos); 231 CacheEntry stepCacheEntry=array[index]; 232 // stepCacheEntry.pos may be <, == or > than tagPos. 233 // stepCacheEntry.pos is either == or > pos. 234 int compactStartIndex=Integer.MAX_VALUE; 235 if (stepCacheEntry.pos==pos) { 236 // a cache entry was aleady at pos (containing null or wrong tagType) 237 stepCacheEntry.nextCached=true; 238 if (stepCacheEntry.isRedundant()) {stepCacheEntry.removed=true; compactStartIndex=Math.min(compactStartIndex,stepCacheEntry.index);} 239 } else if (!getPrevious(stepCacheEntry).nextCached) { 240 // we have to add a new cacheEntry at pos: 241 if (tagType==null) 242 cache.addTagAt(pos,false); // this pos has never been checked before, so add it to all relevant SubCaches (a null or unregistered tag entry is always added to this SubCache) 243 else 244 addTagAt(pos,null); // all we know is that the pos doesn't contain a tag of this SubCache's type, so add a null entry to this SubCache only. 245 // now we have to reload the index and stepCacheEntry as they may have changed: 246 stepCacheEntry=array[index=getIndexOfPos(pos)]; 247 // stepCacheEntry.pos may be <, == or > than tagPos. 248 // stepCacheEntry.pos is either == or > pos. (the latter if the added entry was redundant) 249 if (stepCacheEntry.pos==pos) { 250 // perform same steps as in the (stepCacheEntry.pos==pos) if condition above: 251 stepCacheEntry.nextCached=true; 252 if (stepCacheEntry.isRedundant()) {stepCacheEntry.removed=true; compactStartIndex=Math.min(compactStartIndex,stepCacheEntry.index);} 253 } 254 } 255 if (stepCacheEntry.pos<tagPos) { 256 while (true) { 257 stepCacheEntry=array[++index]; 258 if (stepCacheEntry.pos>=tagPos) break; 259 if (stepCacheEntry.tag!=null) { 260 if (stepCacheEntry.tag.includeInSearch()) throw new SourceCacheEntryMissingInternalError(tagType,tag,this); 261 stepCacheEntry.previousCached=true; 262 stepCacheEntry.nextCached=true; 263 } else { 264 stepCacheEntry.removed=true; compactStartIndex=Math.min(compactStartIndex,stepCacheEntry.index); 265 } 266 } 267 if (stepCacheEntry.pos!=tagPos) throw new FoundCacheEntryMissingInternalError(tagType,tag,this); 268 } 269 stepCacheEntry.previousCached=true; 270 compact(compactStartIndex); 271 } 272 compact(int i)273 private void compact(int i) { 274 final int lastIndex=lastIndex(); 275 int removedCount=1; 276 while (i<lastIndex) { 277 final CacheEntry cacheEntry=array[++i]; 278 if (cacheEntry.removed) 279 removedCount++; 280 else 281 array[cacheEntry.index=i-removedCount]=cacheEntry; 282 } 283 } 284 add(final CacheEntry previousCacheEntry, final CacheEntry newCacheEntry, final CacheEntry nextCacheEntry)285 private void add(final CacheEntry previousCacheEntry, final CacheEntry newCacheEntry, final CacheEntry nextCacheEntry) { 286 if (!newCacheEntry.isRedundant()) insert(newCacheEntry); 287 if (newCacheEntry.previousCached) { 288 previousCacheEntry.nextCached=true; 289 if (previousCacheEntry.isRedundant()) remove(previousCacheEntry); 290 } 291 if (newCacheEntry.nextCached) { 292 nextCacheEntry.previousCached=true; 293 if (nextCacheEntry.isRedundant()) remove(nextCacheEntry); 294 } 295 } 296 getIndexOfPos(final int pos)297 private int getIndexOfPos(final int pos) { 298 // return the index of the cacheEntry at pos, or the index where it would be inserted if it does not exist. 299 int minIndex=0; 300 int maxIndex=lastIndex(); 301 // using the following complex calculation instead of a binary search is likely to result in less iterations but is slower overall: 302 // int index=(pos*maxIndex)/cache.getSourceLength(); // approximate first guess at index, assuming even distribution of cache entries 303 int index=maxIndex>>1; 304 while (true) { 305 final CacheEntry cacheEntry=array[index]; 306 if (pos>cacheEntry.pos) { 307 final CacheEntry nextCacheEntry=getNext(cacheEntry); 308 if (pos<=nextCacheEntry.pos) return nextCacheEntry.index; 309 minIndex=nextCacheEntry.index; 310 } else if (pos<cacheEntry.pos) { 311 final CacheEntry previousCacheEntry=getPrevious(cacheEntry); 312 if (pos==previousCacheEntry.pos) return previousCacheEntry.index; 313 if (pos>previousCacheEntry.pos) return index; 314 maxIndex=previousCacheEntry.index; 315 } else { 316 return index; 317 } 318 index=(minIndex+maxIndex)>>1; 319 // using the following complex calculation instead of a binary search is likely to result in less iterations but is slower overall: 320 // final int minIndexPos=array[minIndex].pos; 321 // index=((maxIndex-minIndex-1)*(pos-minIndexPos))/(array[maxIndex].pos-minIndexPos)+minIndex+1; // approximate next guess at index, assuming even distribution of cache entries between min and max entries 322 } 323 } 324 getNext(final CacheEntry cacheEntry)325 private CacheEntry getNext(final CacheEntry cacheEntry) { 326 return array[cacheEntry.index+1]; 327 } 328 getPrevious(final CacheEntry cacheEntry)329 private CacheEntry getPrevious(final CacheEntry cacheEntry) { 330 return array[cacheEntry.index-1]; 331 } 332 lastIndex()333 private int lastIndex() { 334 return eof.index; 335 } 336 insert(final CacheEntry cacheEntry)337 private void insert(final CacheEntry cacheEntry) { 338 final int index=cacheEntry.index; 339 if (array.length==size()) doubleCapacity(); 340 for (int i=lastIndex(); i>=index; i--) { 341 final CacheEntry movedCacheEntry=array[i]; 342 array[movedCacheEntry.index=(i+1)]=movedCacheEntry; 343 } 344 array[index]=cacheEntry; 345 } 346 remove(final CacheEntry cacheEntry)347 private void remove(final CacheEntry cacheEntry) { 348 final int lastIndex=lastIndex(); 349 for (int i=cacheEntry.index; i<lastIndex; i++) { 350 final CacheEntry movedCacheEntry=array[i+1]; 351 array[movedCacheEntry.index=i]=movedCacheEntry; 352 } 353 } 354 doubleCapacity()355 private void doubleCapacity() { 356 // assumes size==array.length 357 final CacheEntry[] newArray=new CacheEntry[array.length << 1]; 358 for (int i=lastIndex(); i>=0; i--) newArray[i]=array[i]; 359 array=newArray; 360 } 361 362 private static class CacheEntryMissingInternalError extends AssertionError { CacheEntryMissingInternalError(final TagType tagType, final Tag tag, final SubCache subCache, final String message)363 public CacheEntryMissingInternalError(final TagType tagType, final Tag tag, final SubCache subCache, final String message) { 364 super("INTERNAL ERROR: Inconsistent Cache State for TagType \""+tagType+"\" - "+message+' '+tag.getDebugInfo()+'\n'+subCache); 365 } 366 } 367 368 private static class SourceCacheEntryMissingInternalError extends CacheEntryMissingInternalError { SourceCacheEntryMissingInternalError(final TagType tagType, final Tag tag, final SubCache subCache)369 public SourceCacheEntryMissingInternalError(final TagType tagType, final Tag tag, final SubCache subCache) { 370 super(tagType,tag,subCache,"cache entry no longer found in source:"); 371 } 372 } 373 374 private static class FoundCacheEntryMissingInternalError extends CacheEntryMissingInternalError { FoundCacheEntryMissingInternalError(final TagType tagType, final Tag tag, final SubCache subCache)375 public FoundCacheEntryMissingInternalError(final TagType tagType, final Tag tag, final SubCache subCache) { 376 super(tagType,tag,subCache,"missing cache entry for found tag"); 377 } 378 } 379 380 private final class TagIterator implements Iterator<Tag> { 381 private int i=0; 382 private Tag nextTag; TagIterator()383 public TagIterator() { 384 loadNextTag(); 385 } hasNext()386 public boolean hasNext() { 387 return nextTag!=null; 388 } next()389 public Tag next() { 390 final Tag result=nextTag; 391 loadNextTag(); 392 return result; 393 } remove()394 public void remove() { 395 throw new UnsupportedOperationException(); 396 } loadNextTag()397 private void loadNextTag() { 398 while (++i<=lastIndex() && (nextTag=array[i].tag)==null) {} 399 } 400 } 401 402 private static final class CacheEntry { 403 public int index; 404 public final int pos; 405 public final Tag tag; 406 public boolean previousCached; 407 public boolean nextCached; 408 public boolean removed=false; 409 CacheEntry(final int index, final int pos, final Tag tag, final boolean previousCached, final boolean nextCached)410 public CacheEntry(final int index, final int pos, final Tag tag, final boolean previousCached, final boolean nextCached) { 411 this.index=index; 412 this.pos=pos; 413 this.tag=tag; 414 this.previousCached=previousCached; 415 this.nextCached=nextCached; 416 } 417 isRedundant()418 public boolean isRedundant() { 419 return tag==null && previousCached && nextCached; 420 } 421 toString()422 public String toString() { 423 return pad(index,4)+" "+pad(pos,5)+" "+(previousCached?'|':'-')+' '+(nextCached?'|':'-')+' '+(tag==null ? "null" : tag.getDebugInfo()); 424 } 425 pad(final int n, final int places)426 private String pad(final int n, final int places) { 427 final String nstring=String.valueOf(n); 428 final StringBuilder sb=new StringBuilder(places); 429 for (int i=places-nstring.length(); i>0; i--) sb.append(' '); 430 sb.append(nstring); 431 return sb.toString(); 432 } 433 } 434 } 435