import io

from mrjob.job import MRJob
from mrjob.step import MRStep
class MostPopularMovie(MRJob):
    """Two-step job that reports the movie with the most ratings.

    Step 1 counts ratings per movie ID and swaps each ID for its title,
    emitting every ``[count, title]`` pair under the single key ``"most"``.
    Step 2 receives all of those pairs in one reducer call and keeps the
    largest one.

    The input is read as tab-separated lines whose second field is the movie
    ID. The title lookup file is read as pipe-separated lines of the form
    ``id|title|...`` (presumably the MovieLens ``u.data`` / ``u.item`` files;
    verify against the data set actually used).
    """

    def configure_options(self):
        """Register the ``--items`` file option in addition to the defaults."""
        super(MostPopularMovie, self).configure_options()
        self.add_file_option('--items', help='Path to u.item')

    def steps(self):
        """Return the two steps: count-and-name, then pick the maximum."""
        return [
            MRStep(mapper=self.mapper_count,
                   reducer_init=self.reducer_init,
                   reducer=self.reducer_count),
            MRStep(reducer=self.reducer_most),
        ]

    def mapper_count(self, _, line):
        """Emit ``(movie_id, 1)`` for every rating line.

        Blank or truncated lines (fewer than two tab-separated fields) are
        skipped instead of raising ``IndexError`` and failing the whole task.
        """
        fields = line.split("\t")
        if len(fields) < 2:
            return
        yield fields[1], 1

    def reducer_init(self):
        """Load the movie-ID -> title table before any reducing starts.

        The file passed via ``--items`` is used when the option was given;
        otherwise the historical default ``u.item`` in the working directory
        is opened, so existing invocations keep working.
        """
        items_path = getattr(self.options, 'items', None) or "u.item"
        self.movieNames = {}
        # Decoding as ASCII with errors='ignore' drops non-ASCII bytes, which
        # is what the previous ``unicode(..., errors='ignore')`` call did, but
        # ``io.open`` works on both Python 2 and Python 3.
        with io.open(items_path, encoding='ascii', errors='ignore') as f:
            for line in f:
                fields = line.split('|')
                if len(fields) < 2:
                    # Malformed or blank line: no title to record.
                    continue
                self.movieNames[fields[0]] = fields[1]

    def reducer_count(self, movie, count_sum):
        """Emit ``("most", [total_ratings, title])`` for one movie.

        A movie ID missing from the lookup table falls back to the ID itself
        rather than raising ``KeyError``.
        """
        yield "most", [sum(count_sum), self.movieNames.get(movie, movie)]

    def reducer_most(self, _, freq):
        """Emit the largest ``[count, title]`` pair.

        Pairs compare element by element, so the count decides and the title
        only breaks ties.
        """
        yield "most", max(freq)
# Script entry point: run the most-popular-movie job when executed directly.
if __name__ == "__main__":
    MostPopularMovie.run()
from mrjob.job import MRJob
class MRTextInfo(MRJob):
    """Count the phrases, words and word characters in the input text.

    A phrase is a period-delimited fragment of a line that contains at least
    one word; a word is a whitespace-delimited token. The ``characters``
    total is the summed length of all words, so whitespace and periods are
    not counted.
    """

    def mapper(self, _, line):
        """Emit per-phrase totals for ``phrases``, ``words`` and ``characters``.

        Fragments without any word (blank lines, or the empty remainder after
        a trailing period) are skipped so they do not inflate the phrase
        count. Word and character totals are emitted once per phrase rather
        than once per word; the reducer sums them to the same result.
        """
        for phrase in line.split('.'):
            words = phrase.split()
            if not words:
                continue
            yield 'phrases', 1
            yield 'words', len(words)
            yield 'characters', sum(len(word) for word in words)

    def reducer(self, key, counts):
        """Emit the grand total for one counter key."""
        yield key, sum(counts)
# Script entry point: run the text-statistics job when executed directly.
if __name__ == "__main__":
    MRTextInfo.run()