from mrjob.job import MRJob
from mrjob.step import MRStep
import re

class MRWordComposedEntirelyOfSmallerWords(MRJob):

    N = 10  # how many longest words to report (assumed value; `n` was undefined)

    def mapper(self, _, line):
        # Emit every word in the line, keyed by its length.
        words = re.findall(r'\b\w+\b', line)
        for word in words:
            yield len(word), word

    def reducer(self, length, words):
        # Materialize the stream so it can be iterated and searched repeatedly.
        words = list(words)
        composed_words = []
        for word in words:
            # Keep the word only if all of its sub-words appear in this group.
            if all(sub_word in words for sub_word in self.get_sub_words(word)):
                composed_words.append(word)
        # Emit survivors, longest first, under one key for the second step.
        for composed_word in sorted(composed_words, key=len, reverse=True):
            yield None, [len(composed_word), composed_word]

    def reducer_2(self, _, length_word_pairs):
        longest_n_words = []
        count_of_words = 0
        for length, word in length_word_pairs:
            longest_n_words.append((length, word))
            count_of_words += 1
            if len(longest_n_words) > self.N:
                # Drop the shortest word kept so far, leaving the N longest.
                longest_n_words.sort(reverse=True)
                longest_n_words.pop()
        yield f"The {self.N} longest words are:", ''
        for _length, word in longest_n_words:
            yield word, ''
        yield "The count of words is:", count_of_words

    def get_sub_words(self, word):
        # Proper prefixes of the word that occur again later in the word,
        # e.g. get_sub_words("coco") -> ["c", "co"].
        sub_words = []
        for i in range(1, len(word)):
            sub_word = word[:i]
            if sub_word in word[i:]:
                sub_words.append(sub_word)
        return sub_words


if __name__ == '__main__':
    # Run the job in-process and print the final step's output.
    mr_job = MRWordComposedEntirelyOfSmallerWords(args=['<input_file>'])
    with mr_job.make_runner() as runner:
        runner.run()
        for key, value in mr_job.parse_output(runner.cat_output()):
            print(key, value)
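
# Usage sketch (assumes mrjob's default inline runner): replace '<input_file>'
# above with the path to a plain-text file, then run this script with plain
# `python`. It prints the N longest words that survive the first step and a
# count of how many words reached the second step.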