Untitled

mail@pastecode.io avatar
unknown
python
2 years ago
1.6 kB
6
Indexable
import heapq
import itertools
import re

from mrjob.job import MRJob
from mrjob.step import MRStep

class MRWordComposedEntirelyOfSmallerWords(MRJob):
    """Two-step job reporting the longest words that pass a prefix check.

    Step 1 (``mapper`` + ``reducer``) groups words by length and keeps those
    for which every prefix returned by ``get_sub_words`` is itself a word in
    the group.  Step 2 (``reducer_2``) gathers the survivors under a single
    key and reports the ``TOP_N`` longest of them plus the total count.

    NOTE(review): step 1 compares a word's (shorter) prefixes against words
    of the *same* length, so the membership test can never succeed; in
    effect a word is kept only when it has no repeated prefix at all.  That
    looks unintended for a "composed of smaller words" job, but the rule is
    preserved here -- confirm the intended rule before changing it.
    """

    # Number of longest words reported by ``reducer_2``.  The original code
    # read an undefined name ``n``; 10 is an arbitrary default -- override it
    # in a subclass if a different cut-off is wanted.
    TOP_N = 10

    # Compiled once instead of on every input line.
    WORD_RE = re.compile(r'\b\w+\b')

    def steps(self):
        """Chain both reducers; without this ``reducer_2`` would never run."""
        return [
            MRStep(mapper=self.mapper, reducer=self.reducer),
            MRStep(reducer=self.reducer_2),
        ]

    def mapper(self, _, line):
        """Emit ``(len(word), word)`` for every word found in *line*."""
        for word in self.WORD_RE.findall(line):
            yield len(word), word

    def reducer(self, length, words):
        """Yield ``(None, [len(word), word])`` for each word that passes.

        A word passes when every prefix from ``get_sub_words`` is present
        among *words* (vacuously true when there are no such prefixes).
        The ``None`` key funnels all survivors into one group for step 2.
        """
        # ``words`` is a one-shot iterator: running ``in`` against it while
        # looping over it would consume the remaining values and silently
        # drop them, so materialise it first.
        same_length_words = list(words)
        known_words = set(same_length_words)
        for word in same_length_words:
            if all(sub in known_words for sub in self.get_sub_words(word)):
                yield None, [len(word), word]

    def reducer_2(self, _, words):
        """Report the ``TOP_N`` longest words and the total word count.

        *words* yields ``[length, word]`` pairs as produced by ``reducer``.
        Output is a header line, one line per selected word (longest first,
        ties broken by reverse alphabetical order), then the count.
        """
        top_n = self.TOP_N
        # Min-heap holding the ``top_n`` largest (length, word) pairs seen so
        # far, so the full input never has to be held in memory.
        heap = []
        count_of_words = 0
        for length, word in words:
            count_of_words += 1
            entry = (length, word)
            if len(heap) < top_n:
                heapq.heappush(heap, entry)
            else:
                # Pushes ``entry`` and drops the smallest, keeping the size.
                heapq.heappushpop(heap, entry)
        yield f"The {top_n} longest words are:", ''
        for _length, word in sorted(heap, reverse=True):
            yield word, ''
        yield "The count of words is:", count_of_words

    def get_sub_words(self, word):
        """Return the proper prefixes of *word* that occur again later in it.

        For each split point ``i`` the prefix ``word[:i]`` is kept when it is
        a substring of the remainder ``word[i:]``; e.g. ``"abab"`` gives
        ``["a", "ab"]``.
        """
        return [
            word[:i]
            for i in range(1, len(word))
            if word[:i] in word[i:]
        ]

if __name__ == '__main__':
    # Let mrjob parse the command line (input paths, -r runner, ...) instead
    # of hard-coding a placeholder file name.  ``run()`` is also what mrjob
    # expects in a job file: the same script is re-invoked to execute the
    # individual steps, so it must not build its own runner here.
    MRWordComposedEntirelyOfSmallerWords.run()