# Word count over a small text file using the Spark RDD API.
# `sc` is not defined in this snippet; presumably it is the SparkContext
# supplied by the notebook environment -- confirm before running elsewhere.
# NOTE: despite its name, `df` holds the RDD returned by sc.textFile
# (one element per line of the file), not a DataFrame.
df = sc.textFile("dbfs:/FileStore/test.txt")

# Contents of test.txt:
#   hadoop is fast
#   hive is sql on hdfs
#   spark is superfast
#   spark is awesome

# split() with no argument splits on any run of whitespace and returns an
# empty list for blank lines, so no empty-string "words" end up counted
# (split(" ") would emit '' for blank lines and repeated spaces).
# reduceByKey merges the counts inside each partition before the shuffle,
# whereas groupByKey().mapValues(sum) ships every (word, 1) pair over the
# network and only then adds them up.
fm = (
    df.flatMap(lambda line: line.split())
    .map(lambda word: (word, 1))
    .reduceByKey(lambda left, right: left + right)
)

# Fetch up to 20 (word, count) pairs to the driver; ordering is not guaranteed.
fm.take(20)
Out[8]: [('hadoop', 1), ('is', 4), ('hive', 1), ('hdfs', 1), ('awesome', 1), ('fast', 1), ('sql', 1), ('on', 1), ('spark', 2), ('superfast', 1)]
No comments:
Post a Comment