/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.examples.terasort;

import java.io.BufferedOutputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.PrintStream;
import java.math.BigInteger;
import java.util.zip.Checksum;

import org.apache.hadoop.util.PureJavaCrc32;

/**
 * A single process data generator for the terasort data. Based on gensort.c
 * version 1.1 (3 Mar 2009) from Chris Nyberg &lt;chris.nyberg@ordinal.com&gt;.
 */
public class GenSort {

  /** Number of bytes written into the record buffer by the generators. */
  private static final int RECORD_LENGTH = 100;

  /** Number of distinct key characters used by the ascii generator. */
  private static final int ASCII_BASE = 95;

  private static final BigInteger NINETY_FIVE = BigInteger.valueOf(ASCII_BASE);

  /**
   * Generate a "binary" record suitable for all sort benchmarks *except*
   * PennySort.
   *
   * <p>Bytes written into {@code recBuf}:
   * <pre>
   *   0..9    key: rand.getByte(0) .. rand.getByte(9)
   *   10..11  break bytes 0x00 0x11
   *   12..43  the 32 hex digits of recordNumber
   *   44..47  break bytes 0x88 0x99 0xAA 0xBB
   *   48..95  filler: hex digits 20..31 of rand, each repeated 4 times
   *   96..99  break bytes 0xCC 0xDD 0xEE 0xFF
   * </pre>
   *
   * @param recBuf destination buffer; indexes 0..99 are overwritten
   * @param rand the random number for this record
   * @param recordNumber the number of this record
   */
  static void generateRecord(byte[] recBuf, Unsigned16 rand,
                             Unsigned16 recordNumber) {
    /* generate the 10-byte key using the high 10 bytes of the 128-bit
     * random number
     */
    for (int i = 0; i < 10; ++i) {
      recBuf[i] = rand.getByte(i);
    }

    /* add 2 bytes of "break" */
    recBuf[10] = 0x00;
    recBuf[11] = 0x11;

    /* convert the 128-bit record number to 32 ascii hexadecimal digits
     * as the next 32 bytes of the record.
     */
    for (int i = 0; i < 32; i++) {
      recBuf[12 + i] = (byte) recordNumber.getHexDigit(i);
    }

    /* add 4 bytes of "break" data */
    recBuf[44] = (byte) 0x88;
    recBuf[45] = (byte) 0x99;
    recBuf[46] = (byte) 0xAA;
    recBuf[47] = (byte) 0xBB;

    /* add 48 bytes of filler: hex digits 20..31 of the random number
     * (its low 48 bits), each digit repeated four times
     */
    for (int i = 0; i < 12; ++i) {
      byte digit = (byte) rand.getHexDigit(20 + i);
      int base = 48 + i * 4;
      recBuf[base] = digit;
      recBuf[base + 1] = digit;
      recBuf[base + 2] = digit;
      recBuf[base + 3] = digit;
    }

    /* add 4 bytes of "break" data */
    recBuf[96] = (byte) 0xCC;
    recBuf[97] = (byte) 0xDD;
    recBuf[98] = (byte) 0xEE;
    recBuf[99] = (byte) 0xFF;
  }

  /**
   * Build a non-negative BigInteger from the 64 bits of {@code x}, i.e.
   * interpret {@code x} as an unsigned 64-bit value.
   *
   * @param x the bits to reinterpret
   * @return the unsigned value of {@code x}
   */
  private static BigInteger makeBigInteger(long x) {
    byte[] data = new byte[8];
    // big-endian: most significant byte first
    for (int i = 0; i < 8; ++i) {
      data[i] = (byte) (x >>> (56 - 8 * i));
    }
    // signum 1 forces a positive magnitude regardless of the top bit
    return new BigInteger(1, data);
  }

  /**
   * Generate an ascii record suitable for all sort benchmarks including
   * PennySort.
   *
   * <p>Bytes written into {@code recBuf}:
   * <pre>
   *   0..7    key: base-95 digits of rand.getHigh8(), least significant
   *           first, each offset by ' '
   *   8..9    key: two lowest base-95 digits of rand.getLow8(), offset by ' '
   *   10..11  two spaces
   *   12..43  the 32 hex digits of recordNumber
   *   44..45  two spaces
   *   46..97  filler: hex digits 19..31 of rand, each repeated 4 times
   *   98..99  "\r\n"
   * </pre>
   *
   * @param recBuf destination buffer; indexes 0..99 are overwritten
   * @param rand the random number for this record
   * @param recordNumber the number of this record
   */
  static void generateAsciiRecord(byte[] recBuf, Unsigned16 rand,
                                  Unsigned16 recordNumber) {

    /* generate the 10-byte ascii key using mostly the high 64 bits.
     */
    long temp = rand.getHigh8();
    if (temp < 0) {
      // use biginteger to avoid the negative sign problem: the value is
      // treated as unsigned for the first division; afterwards the quotient
      // fits in a non-negative long.
      BigInteger bigTemp = makeBigInteger(temp);
      recBuf[0] = (byte) (' ' + (bigTemp.mod(NINETY_FIVE).longValue()));
      temp = bigTemp.divide(NINETY_FIVE).longValue();
    } else {
      recBuf[0] = (byte) (' ' + (temp % ASCII_BASE));
      temp /= ASCII_BASE;
    }
    for (int i = 1; i < 8; ++i) {
      recBuf[i] = (byte) (' ' + (temp % ASCII_BASE));
      temp /= ASCII_BASE;
    }

    /* the last two key characters come from the other 64 bits */
    temp = rand.getLow8();
    if (temp < 0) {
      BigInteger bigTemp = makeBigInteger(temp);
      recBuf[8] = (byte) (' ' + (bigTemp.mod(NINETY_FIVE).longValue()));
      temp = bigTemp.divide(NINETY_FIVE).longValue();
    } else {
      recBuf[8] = (byte) (' ' + (temp % ASCII_BASE));
      temp /= ASCII_BASE;
    }
    recBuf[9] = (byte) (' ' + (temp % ASCII_BASE));

    /* add 2 bytes of "break" */
    recBuf[10] = ' ';
    recBuf[11] = ' ';

    /* convert the 128-bit record number to 32 ascii hexadecimal digits
     * as the next 32 bytes of the record.
     */
    for (int i = 0; i < 32; i++) {
      recBuf[12 + i] = (byte) recordNumber.getHexDigit(i);
    }

    /* add 2 bytes of "break" data */
    recBuf[44] = ' ';
    recBuf[45] = ' ';

    /* add 52 bytes of filler: hex digits 19..31 of the random number,
     * each digit repeated four times
     */
    for (int i = 0; i < 13; ++i) {
      byte digit = (byte) rand.getHexDigit(19 + i);
      int base = 46 + i * 4;
      recBuf[base] = digit;
      recBuf[base + 1] = digit;
      recBuf[base + 2] = digit;
      recBuf[base + 3] = digit;
    }

    /* add 2 bytes of "break" data */
    recBuf[98] = '\r';  /* nice for Windows */
    recBuf[99] = '\n';
  }

  /**
   * Print the command line help to standard output and terminate the JVM
   * with exit status 1. This method does not return.
   */
  private static void usage() {
    PrintStream out = System.out;
    out.println("usage: gensort [-a] [-c] [-bSTARTING_REC_NUM] NUM_RECS FILE_NAME");
    out.println("-a        Generate ascii records required for PennySort or JouleSort.");
    out.println("          These records are also an alternative input for the other");
    out.println("          sort benchmarks. Without this flag, binary records will be");
    out.println("          generated that contain the highest density of randomness in");
    out.println("          the 10-byte key.");
    out.println("-c        Calculate the sum of the crc32 checksums of each of the");
    out.println("          generated records and send it to standard output.");
    out.println("-bN       Set the beginning record generated to N. By default the");
    out.println("          first record generated is record 0.");
    out.println("NUM_RECS  The number of sequential records to generate.");
    out.println("FILE_NAME The name of the file to write the records to.\n");
    out.println("Example 1 - to generate 1000000 ascii records starting at record 0 to");
    out.println("the file named \"pennyinput\":");
    out.println("    gensort -a 1000000 pennyinput\n");
    out.println("Example 2 - to generate 1000 binary records beginning with record 2000");
    out.println("to the file named \"partition2\":");
    out.println("    gensort -b2000 1000 partition2");
    System.exit(1);
  }

  /**
   * Generate a run of consecutive records and write them to a stream.
   *
   * <p>The stream is neither flushed nor closed by this method; that is the
   * caller's responsibility.
   *
   * @param out the stream that receives the 100-byte records
   * @param useAscii true to use {@link #generateAsciiRecord}, false to use
   *                 {@link #generateRecord}
   * @param firstRecordNumber the number of the first record to generate
   * @param recordsToGenerate how many records to generate
   * @param checksum if non-null, the crc32 of every generated record is
   *                 added to this value; if null no checksum is computed
   * @throws IOException if writing to {@code out} fails
   */
  public static void outputRecords(OutputStream out,
                                   boolean useAscii,
                                   Unsigned16 firstRecordNumber,
                                   Unsigned16 recordsToGenerate,
                                   Unsigned16 checksum
                                   ) throws IOException {
    byte[] row = new byte[RECORD_LENGTH];
    Unsigned16 recordNumber = new Unsigned16(firstRecordNumber);
    // one past the final record: firstRecordNumber + recordsToGenerate
    Unsigned16 lastRecordNumber = new Unsigned16(firstRecordNumber);
    lastRecordNumber.add(recordsToGenerate);
    Checksum crc = new PureJavaCrc32();
    Unsigned16 tmp = new Unsigned16();
    Unsigned16 one = new Unsigned16(1);
    // position the generator so the sequence lines up with the record number
    Unsigned16 rand = Random16.skipAhead(firstRecordNumber);
    while (!recordNumber.equals(lastRecordNumber)) {
      Random16.nextRand(rand);
      if (useAscii) {
        generateAsciiRecord(row, rand, recordNumber);
      } else {
        generateRecord(row, rand, recordNumber);
      }
      if (checksum != null) {
        // the checksum is the sum of the per-record crc32 values
        crc.reset();
        crc.update(row, 0, row.length);
        tmp.set(crc.getValue());
        checksum.add(tmp);
      }
      recordNumber.add(one);
      out.write(row);
    }
  }

  /**
   * Command line entry point; see {@link #usage()} for the accepted
   * arguments. When {@code -c} is given the checksum is printed to standard
   * output after all records have been written.
   *
   * @param args the command line arguments
   * @throws Exception if parsing the arguments or writing the file fails
   */
  public static void main(String[] args) throws Exception {
    Unsigned16 startingRecord = new Unsigned16();
    boolean useAscii = false;
    Unsigned16 checksum = null;

    // consume leading "-x" options; stop at the first positional argument
    int i;
    for (i = 0; i < args.length; ++i) {
      String arg = args[i];
      int argLength = arg.length();
      if (argLength >= 1 && arg.charAt(0) == '-') {
        if (argLength < 2) {
          usage();
        }
        switch (arg.charAt(1)) {
        case 'a':
          useAscii = true;
          break;
        case 'b':
          startingRecord = Unsigned16.fromDecimal(arg.substring(2));
          break;
        case 'c':
          checksum = new Unsigned16();
          break;
        default:
          usage();
        }
      } else {
        break;
      }
    }
    // exactly NUM_RECS and FILE_NAME must remain
    if (args.length - i != 2) {
      usage();
    }
    Unsigned16 numberOfRecords = Unsigned16.fromDecimal(args[i]);

    // Buffer the many small record writes, and make sure the file is closed
    // even when record generation fails part way through.
    OutputStream out =
        new BufferedOutputStream(new FileOutputStream(args[i + 1]));
    try {
      outputRecords(out, useAscii, startingRecord, numberOfRecords, checksum);
    } finally {
      out.close();
    }
    if (checksum != null) {
      System.out.println(checksum);
    }
  }

}