#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""
A K-means clustering program using MLlib.

This example requires NumPy (http://www.numpy.org/).
"""
from __future__ import print_function

import sys

import numpy as np
from pyspark import SparkContext
from pyspark.mllib.clustering import KMeans


def parseVector(line):
    """Parse one line of whitespace-separated numbers into a NumPy array.

    Tokens are separated by any run of whitespace, so repeated spaces, tabs
    and leading/trailing whitespace are tolerated. A line containing no
    tokens yields an empty array.

    :param line: text holding the components of a single point.
    :return: one-dimensional ``numpy.ndarray`` of floats, one entry per token.
    :raises ValueError: if a token cannot be converted by ``float()``.
    """
    # split() without an argument never produces empty tokens, unlike
    # split(' '), which would feed '' to float() on doubled or trailing spaces.
    return np.array([float(x) for x in line.split()])


if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: kmeans <file> <k>", file=sys.stderr)
        # sys.exit rather than the bare exit() helper, which is only
        # installed by the site module for interactive use.
        sys.exit(-1)

    # Parse k before creating the SparkContext so that an invalid value
    # fails immediately instead of leaving a started context behind.
    k = int(sys.argv[2])

    sc = SparkContext(appName="KMeans")
    try:
        lines = sc.textFile(sys.argv[1])
        data = lines.map(parseVector)
        model = KMeans.train(data, k)
        print("Final centers: " + str(model.clusterCenters))
        print("Total Cost: " + str(model.computeCost(data)))
    finally:
        # Stop the context even when loading, training or evaluation raises.
        sc.stop()