DataScience/spark/flattened.py

16 lines
440 B
Python
Executable File

#!/usr/bin/env python3
from pyspark import SparkContext, SparkConf
# Create the Spark driver context. No explicit configuration is passed here,
# so settings come from wherever the job is launched.
sc = SparkContext()
try:
    # Create an RDD from pride_and_prejudice.txt where every element is a
    # line of the file.
    pride_rdd = sc.textFile('hdfs://spark-master:9000/shared/pride_and_prejudice.txt')

    # Split every line on whitespace; flatMap flattens the per-line word
    # lists into one RDD whose elements are individual words.
    pride_words_try = pride_rdd.flatMap(lambda line: line.split())
    # take(n) is an action: it triggers the computation and returns the
    # first n elements to the driver as a list.
    print(pride_words_try.take(4))

    # Turn each word into a (word, 1) key/value tuple.
    pride_pairs = pride_words_try.map(lambda x: (x, 1))
    print(pride_pairs.take(10))
finally:
    # Always shut the context down, even if reading the file or an action
    # fails, so the driver's resources are released.
    sc.stop()