Add Spark RDD example scripts (word count and map demos)

2021-01-01 11:34:06 +01:00 · 2021-01-01 11:34:06 +01:00 · f4000c60e0
commit f4000c60e0
parent 07bd6f6315
4 changed files with 70 additions and 0 deletions
--- a/spark/flattened.py
+++ b/spark/flattened.py
@ -0,0 +1,15 @@
 #!/usr/bin/env python3
 from pyspark import SparkContext, SparkConf
 # Start a SparkContext without any explicit configuration. It is stopped in
 # the ``finally`` clause below so the context is released even when one of
 # the Spark actions raises.
 sc = SparkContext()
 try:
     # Create an RDD from pride_and_prejudice.txt where every element is a
     # line of the file.
     pride_rdd = sc.textFile('hdfs://spark-master:9000/shared/pride_and_prejudice.txt')
     # Split each line on whitespace and flatten the per-line lists into one
     # RDD of words.
     pride_words_try = pride_rdd.flatMap(lambda line: line.split())
     print(pride_words_try.take(4))
     # Pair every word with the number 1, producing (word, 1) tuples.
     pride_pairs = pride_words_try.map(lambda x: (x, 1))
     print(pride_pairs.take(10))
 finally:
     sc.stop()
--- a/spark/keytastic.py
+++ b/spark/keytastic.py
@ -0,0 +1,17 @@
 #!/usr/bin/env python3
 from pyspark import SparkContext, SparkConf
 # Start a SparkContext without any explicit configuration. It is stopped in
 # the ``finally`` clause below so the context is released even when one of
 # the Spark actions raises.
 sc = SparkContext()
 try:
     # Create an RDD from pride_and_prejudice.txt where every element is a
     # line of the file.
     pride_rdd = sc.textFile('hdfs://spark-master:9000/shared/pride_and_prejudice.txt')
     # Split each line on whitespace and flatten the per-line lists into one
     # RDD of words.
     pride_words_try = pride_rdd.flatMap(lambda line: line.split())
     # Pair every word with the number 1, then add up the 1s per word to get
     # (word, occurrences) pairs.
     pride_pairs = pride_words_try.map(lambda x: (x, 1))
     word_counts = pride_pairs.reduceByKey(lambda x, y: x + y)
     # takeOrdered returns elements in ascending key order, so the count is
     # negated to get the 10 pairs with the largest counts first.
     top10_words = word_counts.takeOrdered(10, key=lambda p: -p[1])
     print(top10_words)
 finally:
     sc.stop()
--- a/spark/lambda_lengths.py
+++ b/spark/lambda_lengths.py
@ -0,0 +1,21 @@
 #!/usr/bin/env python3
 from pyspark import SparkContext, SparkConf
 def make_plural(word):
     """Return ``word`` with the letter "s" appended.

     The suffix is added unconditionally; no spelling rules are applied.
     """
     suffix = "s"
     return word + suffix
 # Start a SparkContext without any explicit configuration. It is stopped in
 # the ``finally`` clause below so the context is released even when one of
 # the Spark actions raises.
 sc = SparkContext()
 try:
     animal_list = ['dog', 'cat', 'rabbit', 'hare', 'deer', 'gull', 'woodpecker', 'mole']
     # Distribute the list as an RDD split into 2 partitions.
     animal_rdd = sc.parallelize(animal_list, 2)
     # Append "s" to every element using an inline lambda.
     lambda_plural_rdd = animal_rdd.map(lambda x: x + "s")
     print(lambda_plural_rdd.collect())
     # Replace every element with its length.
     word_lengths = animal_rdd.map(lambda x: len(x))
     print(word_lengths.collect())
 finally:
     sc.stop()
--- a/spark/plural.py
+++ b/spark/plural.py
@ -0,0 +1,17 @@
 #!/usr/bin/env python3
 from pyspark import SparkContext, SparkConf
 def make_plural(word):
     """Return ``word`` with the letter "s" appended.

     The suffix is added unconditionally; no spelling rules are applied.
     """
     suffix = "s"
     return word + suffix
 # Start a SparkContext without any explicit configuration. It is stopped in
 # the ``finally`` clause below so the context is released even when one of
 # the Spark actions raises.
 sc = SparkContext()
 try:
     animal_list = ['dog', 'cat', 'rabbit', 'hare', 'deer', 'gull', 'woodpecker', 'mole']
     # Distribute the list as an RDD split into 2 partitions.
     animal_rdd = sc.parallelize(animal_list, 2)
     # Apply make_plural to every element of the RDD.
     plural_rdd = animal_rdd.map(make_plural)
     print(plural_rdd.collect())
 finally:
     sc.stop()