1 /** 2 * 3 * Copyright (c) 2005, European Commission project OneLab under contract 034819 (http://www.one-lab.org) 4 * All rights reserved. 5 * Redistribution and use in source and binary forms, with or 6 * without modification, are permitted provided that the following 7 * conditions are met: 8 * - Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * - Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in 12 * the documentation and/or other materials provided with the distribution. 13 * - Neither the name of the University Catholique de Louvain - UCL 14 * nor the names of its contributors may be used to endorse or 15 * promote products derived from this software without specific prior 16 * written permission. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 21 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 22 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 23 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 24 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 25 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 28 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 /** 33 * Licensed to the Apache Software Foundation (ASF) under one 34 * or more contributor license agreements. See the NOTICE file 35 * distributed with this work for additional information 36 * regarding copyright ownership. The ASF licenses this file 37 * to you under the Apache License, Version 2.0 (the 38 * "License"); you may not use this file except in compliance 39 * with the License. You may obtain a copy of the License at 40 * 41 * http://www.apache.org/licenses/LICENSE-2.0 42 * 43 * Unless required by applicable law or agreed to in writing, software 44 * distributed under the License is distributed on an "AS IS" BASIS, 45 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 46 * See the License for the specific language governing permissions and 47 * limitations under the License. 48 */ 49 50 package org.apache.hadoop.util.bloom; 51 52 import java.io.DataInput; 53 import java.io.DataOutput; 54 import java.io.IOException; 55 56 import java.util.BitSet; 57 58 /** 59 * Implements a <i>Bloom filter</i>, as defined by Bloom in 1970. 60 * <p> 61 * The Bloom filter is a data structure that was introduced in 1970 and that has been adopted by 62 * the networking research community in the past decade thanks to the bandwidth efficiencies that it 63 * offers for the transmission of set membership information between networked hosts. A sender encodes 64 * the information into a bit vector, the Bloom filter, that is more compact than a conventional 65 * representation. Computation and space costs for construction are linear in the number of elements. 66 * The receiver uses the filter to test whether various elements are members of the set. Though the 67 * filter will occasionally return a false positive, it will never return a false negative. When creating 68 * the filter, the sender can choose its desired point in a trade-off between the false positive rate and the size. 69 * 70 * <p> 71 * Originally created by 72 * <a href="http://www.one-lab.org">European Commission One-Lab Project 034819</a>. 73 * 74 * @see Filter The general behavior of a filter 75 * 76 * @see <a href="http://portal.acm.org/citation.cfm?id=362692&dl=ACM&coll=portal">Space/Time Trade-Offs in Hash Coding with Allowable Errors</a> 77 */ 78 public class BloomFilter extends Filter { 79 private static final byte[] bitvalues = new byte[] { 80 (byte)0x01, 81 (byte)0x02, 82 (byte)0x04, 83 (byte)0x08, 84 (byte)0x10, 85 (byte)0x20, 86 (byte)0x40, 87 (byte)0x80 88 }; 89 90 /** The bit vector. */ 91 BitSet bits; 92 93 /** Default constructor - use with readFields */ BloomFilter()94 public BloomFilter() { 95 super(); 96 } 97 98 /** 99 * Constructor 100 * @param vectorSize The vector size of <i>this</i> filter. 101 * @param nbHash The number of hash function to consider. 102 * @param hashType type of the hashing function (see 103 * {@link org.apache.hadoop.util.hash.Hash}). 104 */ BloomFilter(int vectorSize, int nbHash, int hashType)105 public BloomFilter(int vectorSize, int nbHash, int hashType) { 106 super(vectorSize, nbHash, hashType); 107 108 bits = new BitSet(this.vectorSize); 109 } 110 111 @Override add(Key key)112 public void add(Key key) { 113 if(key == null) { 114 throw new NullPointerException("key cannot be null"); 115 } 116 117 int[] h = hash.hash(key); 118 hash.clear(); 119 120 for(int i = 0; i < nbHash; i++) { 121 bits.set(h[i]); 122 } 123 } 124 125 @Override and(Filter filter)126 public void and(Filter filter) { 127 if(filter == null 128 || !(filter instanceof BloomFilter) 129 || filter.vectorSize != this.vectorSize 130 || filter.nbHash != this.nbHash) { 131 throw new IllegalArgumentException("filters cannot be and-ed"); 132 } 133 134 this.bits.and(((BloomFilter) filter).bits); 135 } 136 137 @Override membershipTest(Key key)138 public boolean membershipTest(Key key) { 139 if(key == null) { 140 throw new NullPointerException("key cannot be null"); 141 } 142 143 int[] h = hash.hash(key); 144 hash.clear(); 145 for(int i = 0; i < nbHash; i++) { 146 if(!bits.get(h[i])) { 147 return false; 148 } 149 } 150 return true; 151 } 152 153 @Override not()154 public void not() { 155 bits.flip(0, vectorSize - 1); 156 } 157 158 @Override or(Filter filter)159 public void or(Filter filter) { 160 if(filter == null 161 || !(filter instanceof BloomFilter) 162 || filter.vectorSize != this.vectorSize 163 || filter.nbHash != this.nbHash) { 164 throw new IllegalArgumentException("filters cannot be or-ed"); 165 } 166 bits.or(((BloomFilter) filter).bits); 167 } 168 169 @Override xor(Filter filter)170 public void xor(Filter filter) { 171 if(filter == null 172 || !(filter instanceof BloomFilter) 173 || filter.vectorSize != this.vectorSize 174 || filter.nbHash != this.nbHash) { 175 throw new IllegalArgumentException("filters cannot be xor-ed"); 176 } 177 bits.xor(((BloomFilter) filter).bits); 178 } 179 180 @Override toString()181 public String toString() { 182 return bits.toString(); 183 } 184 185 /** 186 * @return size of the the bloomfilter 187 */ getVectorSize()188 public int getVectorSize() { 189 return this.vectorSize; 190 } 191 192 // Writable 193 194 @Override write(DataOutput out)195 public void write(DataOutput out) throws IOException { 196 super.write(out); 197 byte[] bytes = new byte[getNBytes()]; 198 for(int i = 0, byteIndex = 0, bitIndex = 0; i < vectorSize; i++, bitIndex++) { 199 if (bitIndex == 8) { 200 bitIndex = 0; 201 byteIndex++; 202 } 203 if (bitIndex == 0) { 204 bytes[byteIndex] = 0; 205 } 206 if (bits.get(i)) { 207 bytes[byteIndex] |= bitvalues[bitIndex]; 208 } 209 } 210 out.write(bytes); 211 } 212 213 @Override readFields(DataInput in)214 public void readFields(DataInput in) throws IOException { 215 super.readFields(in); 216 bits = new BitSet(this.vectorSize); 217 byte[] bytes = new byte[getNBytes()]; 218 in.readFully(bytes); 219 for(int i = 0, byteIndex = 0, bitIndex = 0; i < vectorSize; i++, bitIndex++) { 220 if (bitIndex == 8) { 221 bitIndex = 0; 222 byteIndex++; 223 } 224 if ((bytes[byteIndex] & bitvalues[bitIndex]) != 0) { 225 bits.set(i); 226 } 227 } 228 } 229 230 /* @return number of bytes needed to hold bit vector */ getNBytes()231 private int getNBytes() { 232 return (vectorSize + 7) / 8; 233 } 234 }//end class 235