add spark stuff

2021-01-01 11:34:06 +01:00 · 2021-01-01 11:34:06 +01:00 · f4000c60e0
commit f4000c60e0
parent 07bd6f6315
4 changed files with 70 additions and 0 deletions
--- a/spark/flattened.py
+++ b/spark/flattened.py
@ -0,0 +1,15 @@
+#!/usr/bin/env python3
+
+from pyspark import SparkContext, SparkConf
+sc = SparkContext()
+
+# Create an RDD from pride_and_prejudice.txt where every element is a line of the file.
+pride_rdd = sc.textFile('hdfs://spark-master:9000/shared/pride_and_prejudice.txt')
+
+pride_words_try = pride_rdd.flatMap(lambda line: line.split())
+
+print(pride_words_try.take(4))
+
+pride_pairs = pride_words_try.map(lambda x: (x, 1))
+
+print(pride_pairs.take(10))
--- a/spark/keytastic.py
+++ b/spark/keytastic.py
@ -0,0 +1,17 @@
+#!/usr/bin/env python3
+
+from pyspark import SparkContext, SparkConf
+sc = SparkContext()
+
+# Create an RDD from pride_and_prejudice.txt where every element is a line of the file.
+pride_rdd = sc.textFile('hdfs://spark-master:9000/shared/pride_and_prejudice.txt')
+
+pride_words_try = pride_rdd.flatMap(lambda line: line.split())
+
+pride_pairs = pride_words_try.map(lambda x: (x, 1))
+
+word_counts = pride_pairs.reduceByKey(lambda x, y: x + y)
+
+top10_words = word_counts.takeOrdered(10, key=lambda p: -p[1])
+
+print(top10_words)
--- a/spark/lambda_lengths.py
+++ b/spark/lambda_lengths.py
@ -0,0 +1,21 @@
+#!/usr/bin/env python3
+
+from pyspark import SparkContext, SparkConf
+
+
+def make_plural(word):
+    return word + "s"
+
+
+sc = SparkContext()
+animal_list = ['dog', 'cat', 'rabbit', 'hare', 'deer', 'gull', 'woodpecker', 'mole']
+
+animal_rdd = sc.parallelize(animal_list, 2)
+
+lambda_plural_rdd = animal_rdd.map(lambda x: x + "s")
+
+print(lambda_plural_rdd.collect())
+
+word_lengths = animal_rdd.map(lambda x: len(x))
+
+print(word_lengths.collect())
--- a/spark/plural.py
+++ b/spark/plural.py
@ -0,0 +1,17 @@
+#!/usr/bin/env python3
+
+from pyspark import SparkContext, SparkConf
+
+
+def make_plural(word):
+    return word + "s"
+
+
+sc = SparkContext()
+animal_list = ['dog', 'cat', 'rabbit', 'hare', 'deer', 'gull', 'woodpecker', 'mole']
+
+animal_rdd = sc.parallelize(animal_list, 2)
+
+plural_rdd = animal_rdd.map(make_plural)
+
+print(plural_rdd.collect())