#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import print_function

from pyspark import SparkContext
# $example on$
from pyspark.mllib.feature import HashingTF, IDF
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="TFIDFExample")  # SparkContext

    # $example on$
    # Read the corpus: one document per line, tokenized on spaces.
    corpus = sc.textFile("data/mllib/kmeans_data.txt").map(lambda line: line.split(" "))

    # Hash each document's terms into a fixed-size term-frequency vector.
    tf = HashingTF().transform(corpus)

    # HashingTF needs a single pass over the data, but IDF needs two:
    # one to compute the IDF vector, one to scale the term frequencies.
    # Cache the TF vectors so the second pass does not recompute them.
    tf.cache()
    tfidf = IDF().fit(tf).transform(tf)

    # spark.mllib's IDF can ignore terms occurring in fewer than a minimum
    # number of documents (their IDF is set to 0); enable this by passing
    # minDocFreq to the IDF constructor.
    tfidfIgnore = IDF(minDocFreq=2).fit(tf).transform(tf)
    # $example off$

    print("tfidf:")
    for vector in tfidf.collect():
        print(vector)

    print("tfidfIgnore:")
    for vector in tfidfIgnore.collect():
        print(vector)

    sc.stop()