From f4000c60e008ad3f222239fd528b37d0acffa6c3 Mon Sep 17 00:00:00 2001 From: Johannes Theiner Date: Fri, 1 Jan 2021 11:34:06 +0100 Subject: [PATCH] add spark stuff --- spark/flattened.py | 15 +++++++++++++++ spark/keytastic.py | 17 +++++++++++++++++ spark/lambda_lengths.py | 21 +++++++++++++++++++++ spark/plural.py | 17 +++++++++++++++++ 4 files changed, 70 insertions(+) create mode 100755 spark/flattened.py create mode 100755 spark/keytastic.py create mode 100755 spark/lambda_lengths.py create mode 100755 spark/plural.py diff --git a/spark/flattened.py b/spark/flattened.py new file mode 100755 index 0000000..70be15c --- /dev/null +++ b/spark/flattened.py @@ -0,0 +1,15 @@ +#!/usr/bin/env python3 + +from pyspark import SparkContext, SparkConf +sc = SparkContext() + +#Create an RDD from pride_and_prejudice.txt where every element is a line of the file. +pride_rdd = sc.textFile('hdfs://spark-master:9000/shared/pride_and_prejudice.txt') + +pride_words_try = pride_rdd.flatMap(lambda line: line.split()) + +print(pride_words_try.take(4)) + +pride_pairs = pride_words_try.map(lambda x: (x, 1)) + +print(pride_pairs.take(10)) diff --git a/spark/keytastic.py b/spark/keytastic.py new file mode 100755 index 0000000..c775883 --- /dev/null +++ b/spark/keytastic.py @@ -0,0 +1,17 @@ +#!/usr/bin/env python3 + +from pyspark import SparkContext, SparkConf +sc = SparkContext() + +#Create an RDD from pride_and_prejudice.txt where every element is a line of the file.
+pride_rdd = sc.textFile('hdfs://spark-master:9000/shared/pride_and_prejudice.txt') + +pride_words_try = pride_rdd.flatMap(lambda line: line.split()) + +pride_pairs = pride_words_try.map(lambda x: (x, 1)) + +word_counts = pride_pairs.reduceByKey(lambda x, y: x + y) + +top10_words = word_counts.takeOrdered(10, key=lambda p: -p[1]) + +print(top10_words) diff --git a/spark/lambda_lengths.py b/spark/lambda_lengths.py new file mode 100755 index 0000000..3265083 --- /dev/null +++ b/spark/lambda_lengths.py @@ -0,0 +1,21 @@ +#!/usr/bin/env python3 + +from pyspark import SparkContext, SparkConf + + +def make_plural(word): + return word + "s" + + +sc = SparkContext() +animal_list = ['dog', 'cat', 'rabbit', 'hare', 'deer', 'gull', 'woodpecker', 'mole'] + +animal_rdd = sc.parallelize(animal_list, 2) + +lambda_plural_rdd = animal_rdd.map(lambda x: x + "s") + +print(lambda_plural_rdd.collect()) + +word_lengths = animal_rdd.map(lambda x: len(x)) + +print(word_lengths.collect()) diff --git a/spark/plural.py b/spark/plural.py new file mode 100755 index 0000000..6574259 --- /dev/null +++ b/spark/plural.py @@ -0,0 +1,17 @@ +#!/usr/bin/env python3 + +from pyspark import SparkContext, SparkConf + + +def make_plural(word): + return word + "s" + + +sc = SparkContext() +animal_list = ['dog', 'cat', 'rabbit', 'hare', 'deer', 'gull', 'woodpecker', 'mole'] + +animal_rdd = sc.parallelize(animal_list, 2) + +plural_rdd = animal_rdd.map(make_plural) + +print(plural_rdd.collect())