1 /**
2  * Licensed to the Apache Software Foundation (ASF) under one
3  * or more contributor license agreements.  See the NOTICE file
4  * distributed with this work for additional information
5  * regarding copyright ownership.  The ASF licenses this file
6  * to you under the Apache License, Version 2.0 (the
7  * "License"); you may not use this file except in compliance
8  * with the License.  You may obtain a copy of the License at
9  *
10  *     http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an "AS IS" BASIS,
14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  */
18 package org.apache.hadoop.examples.terasort;
19 
20 import java.io.FileOutputStream;
21 import java.io.IOException;
22 import java.io.OutputStream;
23 import java.io.PrintStream;
24 import java.math.BigInteger;
25 import java.util.zip.Checksum;
26 
27 import org.apache.hadoop.util.PureJavaCrc32;
28 
29 /**
30  * A single process data generator for the terasort data. Based on gensort.c
31  * version 1.1 (3 Mar 2009) from Chris Nyberg <chris.nyberg@ordinal.com>.
32  */
33 public class GenSort {
34 
35   /**
36    * Generate a "binary" record suitable for all sort benchmarks *except*
37    * PennySort.
38    */
generateRecord(byte[] recBuf, Unsigned16 rand, Unsigned16 recordNumber)39   static void generateRecord(byte[] recBuf, Unsigned16 rand,
40                                      Unsigned16 recordNumber) {
41     /* generate the 10-byte key using the high 10 bytes of the 128-bit
42      * random number
43      */
44     for(int i=0; i < 10; ++i) {
45       recBuf[i] = rand.getByte(i);
46     }
47 
48     /* add 2 bytes of "break" */
49     recBuf[10] = 0x00;
50     recBuf[11] = 0x11;
51 
52     /* convert the 128-bit record number to 32 bits of ascii hexadecimal
53      * as the next 32 bytes of the record.
54      */
55     for (int i = 0; i < 32; i++) {
56       recBuf[12 + i] = (byte) recordNumber.getHexDigit(i);
57     }
58 
59     /* add 4 bytes of "break" data */
60     recBuf[44] = (byte) 0x88;
61     recBuf[45] = (byte) 0x99;
62     recBuf[46] = (byte) 0xAA;
63     recBuf[47] = (byte) 0xBB;
64 
65     /* add 48 bytes of filler based on low 48 bits of random number */
66     for(int i=0; i < 12; ++i) {
67       recBuf[48+i*4] = recBuf[49+i*4] = recBuf[50+i*4] = recBuf[51+i*4] =
68         (byte) rand.getHexDigit(20 + i);
69     }
70 
71     /* add 4 bytes of "break" data */
72     recBuf[96] = (byte) 0xCC;
73     recBuf[97] = (byte) 0xDD;
74     recBuf[98] = (byte) 0xEE;
75     recBuf[99] = (byte) 0xFF;
76   }
77 
78 
makeBigInteger(long x)79   private static BigInteger makeBigInteger(long x) {
80     byte[] data = new byte[8];
81     for(int i=0; i < 8; ++i) {
82       data[i] = (byte) (x >>> (56 - 8*i));
83     }
84     return new BigInteger(1, data);
85   }
86 
87   private static final BigInteger NINETY_FIVE = new BigInteger("95");
88 
89   /**
90    * Generate an ascii record suitable for all sort benchmarks including
91    * PennySort.
92    */
generateAsciiRecord(byte[] recBuf, Unsigned16 rand, Unsigned16 recordNumber)93   static void generateAsciiRecord(byte[] recBuf, Unsigned16 rand,
94                                   Unsigned16 recordNumber) {
95 
96     /* generate the 10-byte ascii key using mostly the high 64 bits.
97      */
98     long temp = rand.getHigh8();
99     if (temp < 0) {
100       // use biginteger to avoid the negative sign problem
101       BigInteger bigTemp = makeBigInteger(temp);
102       recBuf[0] = (byte) (' ' + (bigTemp.mod(NINETY_FIVE).longValue()));
103       temp = bigTemp.divide(NINETY_FIVE).longValue();
104     } else {
105       recBuf[0] = (byte) (' ' + (temp % 95));
106       temp /= 95;
107     }
108     for(int i=1; i < 8; ++i) {
109       recBuf[i] = (byte) (' ' + (temp % 95));
110       temp /= 95;
111     }
112     temp = rand.getLow8();
113     if (temp < 0) {
114       BigInteger bigTemp = makeBigInteger(temp);
115       recBuf[8] = (byte) (' ' + (bigTemp.mod(NINETY_FIVE).longValue()));
116       temp = bigTemp.divide(NINETY_FIVE).longValue();
117     } else {
118       recBuf[8] = (byte) (' ' + (temp % 95));
119       temp /= 95;
120     }
121     recBuf[9] = (byte)(' ' + (temp % 95));
122 
123     /* add 2 bytes of "break" */
124     recBuf[10] = ' ';
125     recBuf[11] = ' ';
126 
127     /* convert the 128-bit record number to 32 bits of ascii hexadecimal
128      * as the next 32 bytes of the record.
129      */
130     for (int i = 0; i < 32; i++) {
131       recBuf[12 + i] = (byte) recordNumber.getHexDigit(i);
132     }
133 
134     /* add 2 bytes of "break" data */
135     recBuf[44] = ' ';
136     recBuf[45] = ' ';
137 
138     /* add 52 bytes of filler based on low 48 bits of random number */
139     for(int i=0; i < 13; ++i) {
140       recBuf[46+i*4] = recBuf[47+i*4] = recBuf[48+i*4] = recBuf[49+i*4] =
141         (byte) rand.getHexDigit(19 + i);
142     }
143 
144     /* add 2 bytes of "break" data */
145     recBuf[98] = '\r';	/* nice for Windows */
146     recBuf[99] = '\n';
147 }
148 
149 
usage()150   private static void usage() {
151     PrintStream out = System.out;
152     out.println("usage: gensort [-a] [-c] [-bSTARTING_REC_NUM] NUM_RECS FILE_NAME");
153     out.println("-a        Generate ascii records required for PennySort or JouleSort.");
154     out.println("          These records are also an alternative input for the other");
155     out.println("          sort benchmarks.  Without this flag, binary records will be");
156     out.println("          generated that contain the highest density of randomness in");
157     out.println("          the 10-byte key.");
158     out.println( "-c        Calculate the sum of the crc32 checksums of each of the");
159     out.println("          generated records and send it to standard error.");
160     out.println("-bN       Set the beginning record generated to N. By default the");
161     out.println("          first record generated is record 0.");
162     out.println("NUM_RECS  The number of sequential records to generate.");
163     out.println("FILE_NAME The name of the file to write the records to.\n");
164     out.println("Example 1 - to generate 1000000 ascii records starting at record 0 to");
165     out.println("the file named \"pennyinput\":");
166     out.println("    gensort -a 1000000 pennyinput\n");
167     out.println("Example 2 - to generate 1000 binary records beginning with record 2000");
168     out.println("to the file named \"partition2\":");
169     out.println("    gensort -b2000 1000 partition2");
170     System.exit(1);
171   }
172 
173 
outputRecords(OutputStream out, boolean useAscii, Unsigned16 firstRecordNumber, Unsigned16 recordsToGenerate, Unsigned16 checksum )174   public static void outputRecords(OutputStream out,
175                                    boolean useAscii,
176                                    Unsigned16 firstRecordNumber,
177                                    Unsigned16 recordsToGenerate,
178                                    Unsigned16 checksum
179                                    ) throws IOException {
180     byte[] row = new byte[100];
181     Unsigned16 recordNumber = new Unsigned16(firstRecordNumber);
182     Unsigned16 lastRecordNumber = new Unsigned16(firstRecordNumber);
183     Checksum crc = new PureJavaCrc32();
184     Unsigned16 tmp = new Unsigned16();
185     lastRecordNumber.add(recordsToGenerate);
186     Unsigned16 ONE = new Unsigned16(1);
187     Unsigned16 rand = Random16.skipAhead(firstRecordNumber);
188     while (!recordNumber.equals(lastRecordNumber)) {
189       Random16.nextRand(rand);
190       if (useAscii) {
191         generateAsciiRecord(row, rand, recordNumber);
192       } else {
193         generateRecord(row, rand, recordNumber);
194       }
195       if (checksum != null) {
196         crc.reset();
197         crc.update(row, 0, row.length);
198         tmp.set(crc.getValue());
199         checksum.add(tmp);
200       }
201       recordNumber.add(ONE);
202       out.write(row);
203     }
204   }
205 
main(String[] args)206   public static void main(String[] args) throws Exception {
207     Unsigned16 startingRecord = new Unsigned16();
208     Unsigned16 numberOfRecords;
209     OutputStream out;
210     boolean useAscii = false;
211     Unsigned16 checksum = null;
212 
213     int i;
214     for(i=0; i < args.length; ++i) {
215       String arg = args[i];
216       int argLength = arg.length();
217       if (argLength >= 1 && arg.charAt(0) == '-') {
218         if (argLength < 2) {
219           usage();
220         }
221         switch (arg.charAt(1)) {
222         case 'a':
223           useAscii = true;
224           break;
225         case 'b':
226           startingRecord = Unsigned16.fromDecimal(arg.substring(2));
227           break;
228         case 'c':
229           checksum = new Unsigned16();
230           break;
231         default:
232           usage();
233         }
234       } else {
235         break;
236       }
237     }
238     if (args.length - i != 2) {
239       usage();
240     }
241     numberOfRecords = Unsigned16.fromDecimal(args[i]);
242     out = new FileOutputStream(args[i+1]);
243 
244     outputRecords(out, useAscii, startingRecord, numberOfRecords, checksum);
245     out.close();
246     if (checksum != null) {
247       System.out.println(checksum);
248     }
249   }
250 
251 }
252