From f4000c60e008ad3f222239fd528b37d0acffa6c3 Mon Sep 17 00:00:00 2001 From: Johannes Theiner Date: Fri, 1 Jan 2021 11:34:06 +0100 Subject: [PATCH] add spark stuff --- spark/flattened.py | 15 +++++++++++++++ spark/keytastic.py | 17 +++++++++++++++++ spark/lambda_lengths.py | 21 +++++++++++++++++++++ spark/plural.py | 17 +++++++++++++++++ 4 files changed, 70 insertions(+) create mode 100755 spark/flattened.py create mode 100755 spark/keytastic.py create mode 100755 spark/lambda_lengths.py create mode 100755 spark/plural.py diff --git a/spark/flattened.py b/spark/flattened.py new file mode 100755 index 0000000..70be15c --- /dev/null +++ b/spark/flattened.py @@ -0,0 +1,15 @@ +#!/usr/bin/env python3 + +from pyspark import SparkContext, SparkConf +sc = SparkContext() + +#Create an RDD from pride_and_prejudice.txt where every element is a line of the file. +pride_rdd = sc.textFile('hdfs://spark-master:9000/shared/pride_and_prejudice.txt') + +pride_words_try = pride_rdd.flatMap(lambda line: line.split()) + +print(pride_words_try.take(4)) + +pride_pairs = pride_words_try.map(lambda x: (x, 1)) + +print(pride_pairs.take(10)) diff --git a/spark/keytastic.py b/spark/keytastic.py new file mode 100755 index 0000000..c775883 --- /dev/null +++ b/spark/keytastic.py @@ -0,0 +1,17 @@ +#!/usr/bin/env python3 + +from pyspark import SparkContext, SparkConf +sc = SparkContext() + +#Create an RDD from pride_and_prejudice.txt where every element is a line of the file.
+pride_rdd = sc.textFile('hdfs://spark-master:9000/shared/pride_and_prejudice.txt') + +pride_words_try = pride_rdd.flatMap(lambda line: line.split()) + +pride_pairs = pride_words_try.map(lambda x: (x, 1)) + +word_counts = pride_pairs.reduceByKey(lambda x, y: x + y) + +top10_words = word_counts.takeOrdered(10, key=lambda p: -p[1]) + +print(top10_words) diff --git a/spark/lambda_lengths.py b/spark/lambda_lengths.py new file mode 100755 index 0000000..3265083 --- /dev/null +++ b/spark/lambda_lengths.py @@ -0,0 +1,21 @@ +#!/usr/bin/env python3 + +from pyspark import SparkContext, SparkConf + + +def make_plural(word): + return word + "s" + + +sc = SparkContext() +animal_list = ['dog', 'cat', 'rabbit', 'hare', 'deer', 'gull', 'woodpecker', 'mole'] + +animal_rdd = sc.parallelize(animal_list, 2) + +lambda_plural_rdd = animal_rdd.map(lambda x: x + "s") + +print(lambda_plural_rdd.collect()) + +word_lengths = animal_rdd.map(lambda x: len(x)) + +print(word_lengths.collect()) diff --git a/spark/plural.py b/spark/plural.py new file mode 100755 index 0000000..6574259 --- /dev/null +++ b/spark/plural.py @@ -0,0 +1,17 @@ +#!/usr/bin/env python3 + +from pyspark import SparkContext, SparkConf + + +def make_plural(word): + return word + "s" + + +sc = SparkContext() +animal_list = ['dog', 'cat', 'rabbit', 'hare', 'deer', 'gull', 'woodpecker', 'mole'] + +animal_rdd = sc.parallelize(animal_list, 2) + +plural_rdd = animal_rdd.map(make_plural) + +print(plural_rdd.collect())