# Import SparkContext from pyspark and start a local Spark context
from pyspark import SparkContext
sc = SparkContext()

# Warm-up: sum the values for each key with reduceByKey
from operator import add
rdd1 = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])
sorted(rdd1.reduceByKey(add).collect())
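# -> [('a', 2), ('b', 1)]: the two ("a", 1) pairs are summed into one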
# Download the raw CSV (the github.com .../blob/... URL returns an HTML page, not the file)
!curl -L https://raw.githubusercontent.com/fivethirtyeight/data/master/daily-show-guests/daily_show_guests.csv -o daily.csv
!head -10 daily.csv
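# the first line should be the column header (YEAR, occupation, show, group, guest names)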
raw = sc.textFile("daily.csv")
raw.take(5)

# Drop the header row so it is not tallied as a year, then split each line into fields.
# A naive split(',') is fine here because we only use the first field (YEAR).
header = raw.first()
daily = raw.filter(lambda line: line != header).map(lambda line: line.split(','))
daily.take(5)
# Aggregate the total count of guests per year (YEAR is the first field).
# The chained call must be wrapped in parentheses (or use a backslash) to span lines.
tally = (daily.map(lambda x: (x[0], 1))
              .reduceByKey(lambda x, y: x + y))
print(tally)
# because Spark is lazy we need to perform an action on the RDD
tally.collect()  # equivalent to take(tally.count()) but runs a single job
Question: how do I sort the tally by year?
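One minimal way, assuming the tally built above: the year is already the key of each (year, count) pair, so sortByKey() orders the RDD, and an action such as collect() is still needed to materialize the result.

# Sort the (year, count) pairs by year, ascending
tally.sortByKey().collect()

# Or, since the tally is small, sort on the driver after collecting
sorted(tally.collect())

Note that the years are still strings after split(','), but lexicographic order matches numeric order here because they are all four digits.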