1#
2# Licensed to the Apache Software Foundation (ASF) under one or more
3# contributor license agreements.  See the NOTICE file distributed with
4# this work for additional information regarding copyright ownership.
5# The ASF licenses this file to You under the Apache License, Version 2.0
6# (the "License"); you may not use this file except in compliance with
7# the License.  You may obtain a copy of the License at
8#
9#    http://www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an "AS IS" BASIS,
13# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
16#
17
18"""
Example script: generate random RDDs with pyspark.mllib.random.RandomRDDs.
20"""
21from __future__ import print_function
22
23import sys
24
25from pyspark import SparkContext
26from pyspark.mllib.random import RandomRDDs
27
28
if __name__ == "__main__":
    # No arguments are required; at most one extra command-line token is
    # tolerated (it is not read anywhere in this script).
    if len(sys.argv) not in [1, 2]:
        print("Usage: random_rdd_generation", file=sys.stderr)
        # Use sys.exit rather than the site-provided exit() helper, which is
        # intended for the interactive shell and is absent under `python -S`.
        sys.exit(-1)

    sc = SparkContext(appName="PythonRandomRDDGeneration")

    # Make sure the SparkContext is stopped even if an example below raises.
    try:
        numExamples = 10000  # number of examples to generate

        # Example: RandomRDDs.normalRDD
        normalRDD = RandomRDDs.normalRDD(sc, numExamples)
        print('Generated RDD of %d examples sampled from the standard normal distribution'
              % normalRDD.count())
        print('  First 5 samples:')
        for sample in normalRDD.take(5):
            print('    ' + str(sample))
        print()

        # Example: RandomRDDs.normalVectorRDD
        normalVectorRDD = RandomRDDs.normalVectorRDD(sc, numRows=numExamples, numCols=2)
        print('Generated RDD of %d examples of length-2 vectors.' % normalVectorRDD.count())
        print('  First 5 samples:')
        for sample in normalVectorRDD.take(5):
            print('    ' + str(sample))
        print()
    finally:
        sc.stop()
57