add spark stuff

This commit is contained in:
Johannes Theiner 2021-01-01 11:34:06 +01:00
parent 07bd6f6315
commit f4000c60e0
4 changed files with 70 additions and 0 deletions

15
spark/flattened.py Executable file
View File

@ -0,0 +1,15 @@
#!/usr/bin/env python3
from pyspark import SparkContext, SparkConf
# Use the context-manager form so the SparkContext is stopped when the block
# exits, even if one of the actions below raises.
with SparkContext() as sc:
    # Create an RDD from pride_and_prejudice.txt where every element is a
    # line of the file.
    pride_rdd = sc.textFile('hdfs://spark-master:9000/shared/pride_and_prejudice.txt')

    # Split each line on whitespace and flatten the results, so the RDD
    # holds one element per word instead of one list per line.
    pride_words_try = pride_rdd.flatMap(lambda line: line.split())
    print(pride_words_try.take(4))

    # Pair every word with the count 1.
    pride_pairs = pride_words_try.map(lambda x: (x, 1))
    print(pride_pairs.take(10))

17
spark/keytastic.py Executable file
View File

@ -0,0 +1,17 @@
#!/usr/bin/env python3
from pyspark import SparkContext, SparkConf
# Use the context-manager form so the SparkContext is stopped when the block
# exits, even if one of the actions below raises.
with SparkContext() as sc:
    # Create an RDD from pride_and_prejudice.txt where every element is a
    # line of the file.
    pride_rdd = sc.textFile('hdfs://spark-master:9000/shared/pride_and_prejudice.txt')

    # Split each line on whitespace and flatten to one element per word.
    pride_words_try = pride_rdd.flatMap(lambda line: line.split())

    # Pair every word with 1, then add the 1s up per word.
    pride_pairs = pride_words_try.map(lambda x: (x, 1))
    word_counts = pride_pairs.reduceByKey(lambda x, y: x + y)

    # takeOrdered returns the smallest elements by key, so the count is
    # negated to get the 10 pairs with the highest counts first.
    top10_words = word_counts.takeOrdered(10, key=lambda p: -p[1])
    print(top10_words)

21
spark/lambda_lengths.py Executable file
View File

@ -0,0 +1,21 @@
#!/usr/bin/env python3
from pyspark import SparkContext, SparkConf
def make_plural(word: str) -> str:
    """Return *word* with a trailing "s" appended.

    This is plain string concatenation: no spelling rules or irregular
    plurals are handled (for example, "deer" becomes "deers").
    """
    return word + "s"
# Use the context-manager form so the SparkContext is stopped when the block
# exits, even if one of the actions below raises.
with SparkContext() as sc:
    animal_list = ['dog', 'cat', 'rabbit', 'hare', 'deer', 'gull', 'woodpecker', 'mole']
    # Distribute the list as an RDD, requesting 2 partitions.
    animal_rdd = sc.parallelize(animal_list, 2)

    # Append "s" to every element using an inline lambda.
    lambda_plural_rdd = animal_rdd.map(lambda x: x + "s")
    print(lambda_plural_rdd.collect())

    # Map every element to its character count.
    word_lengths = animal_rdd.map(lambda x: len(x))
    print(word_lengths.collect())

17
spark/plural.py Executable file
View File

@ -0,0 +1,17 @@
#!/usr/bin/env python3
from pyspark import SparkContext, SparkConf
def make_plural(word: str) -> str:
    """Return *word* with a trailing "s" appended.

    This is plain string concatenation: no spelling rules or irregular
    plurals are handled (for example, "deer" becomes "deers").
    """
    return word + "s"
# Use the context-manager form so the SparkContext is stopped when the block
# exits, even if one of the actions below raises.
with SparkContext() as sc:
    animal_list = ['dog', 'cat', 'rabbit', 'hare', 'deer', 'gull', 'woodpecker', 'mole']
    # Distribute the list as an RDD, requesting 2 partitions.
    animal_rdd = sc.parallelize(animal_list, 2)

    # Apply the named function make_plural to every element.
    plural_rdd = animal_rdd.map(make_plural)
    print(plural_rdd.collect())