Untitled
unknown
python
2 years ago
1.6 kB
8
Indexable
import itertools
import re
import sys

from mrjob.job import MRJob
from mrjob.step import MRStep

# Compiled once at import time; same pattern the mapper has always used.
WORD_RE = re.compile(r'\b\w+\b')


class MRWordComposedEntirelyOfSmallerWords(MRJob):
    """Report the longest words that are concatenations of smaller input words.

    The job runs in two steps:

    1. ``mapper`` / ``reducer`` tokenize the input and emit each distinct
       word once, tagged with its length, under the single key ``None``.
    2. ``reducer_2`` therefore sees the whole vocabulary at once, keeps the
       words that can be split into two or more smaller vocabulary words,
       and emits the longest ones followed by the total count.

    The composition check lives in the second step because it needs words
    of *every* length; a reducer keyed by word length only ever sees words
    of one length and can never find the smaller parts.
    """

    # How many of the longest composed words ``reducer_2`` reports.
    # Override in a subclass to change the size of the report.
    LONGEST_WORDS_LIMIT = 10

    def steps(self):
        """Wire both reducers into the job.

        Without an explicit step list mrjob only runs ``mapper`` and
        ``reducer``, so ``reducer_2`` would never be called.
        """
        return [
            MRStep(mapper=self.mapper, reducer=self.reducer),
            MRStep(reducer=self.reducer_2),
        ]

    def mapper(self, _, line):
        """Yield ``(len(word), word)`` for every word found in ``line``."""
        for word in WORD_RE.findall(line):
            yield len(word), word

    def reducer(self, length, words):
        """Emit each distinct word of this length once as ``[length, word]``.

        ``words`` is a one-shot iterator, so it is materialized into a set
        before use. Everything is emitted under the key ``None`` so that the
        next step receives the complete vocabulary in a single call.
        """
        for word in sorted(set(words)):
            yield None, [length, word]

    def reducer_2(self, _, words):
        """Emit the longest composed words and the number of composed words.

        Args:
            _: unused key (always ``None``, see ``reducer``).
            words: iterable of ``[length, word]`` pairs from ``reducer``.

        Yields:
            A header line, then one ``(word, '')`` pair per reported word
            (longest first, ties broken alphabetically), then the count of
            all distinct composed words.
        """
        vocabulary = {word for _length, word in words}
        composed_words = [
            word for word in vocabulary
            if self.get_sub_words(word, vocabulary)
        ]
        # Sort explicitly: arrival order from the previous step is not a
        # reliable "longest first" order.
        composed_words.sort(key=lambda word: (-len(word), word))

        n = self.LONGEST_WORDS_LIMIT
        yield f"The {n} longest words are:", ''
        for word in itertools.islice(composed_words, n):
            yield word, ''
        yield "The count of words is:", len(composed_words)

    def get_sub_words(self, word, vocabulary=()):
        """Split ``word`` into smaller words taken from ``vocabulary``.

        Args:
            word: the word to decompose.
            vocabulary: container of known words (a set gives fast lookups).

        Returns:
            A list of two or more vocabulary words whose concatenation is
            exactly ``word``, or an empty list when no such split exists.
            ``word`` itself never counts as one of its own parts.
        """
        # splits[i] holds one decomposition of word[:i], or None if word[:i]
        # cannot be built from vocabulary words.
        splits = [None] * (len(word) + 1)
        splits[0] = []
        for end in range(1, len(word) + 1):
            for start in range(end):
                if splits[start] is None:
                    continue
                piece = word[start:end]
                # piece == word only for the full span; excluding it forces
                # every part to be strictly smaller than the word.
                if piece != word and piece in vocabulary:
                    splits[end] = splits[start] + [piece]
                    break
        return splits[len(word)] or []


if __name__ == '__main__':
    # Take input paths/options from the command line; fall back to the
    # original placeholder so running with no arguments behaves as before.
    job_args = sys.argv[1:] or ['<input_file>']
    mr_job = MRWordComposedEntirelyOfSmallerWords(args=job_args)
    with mr_job.make_runner() as runner:
        runner.run()
        # cat_output()/parse_output() replace the older
        # stream_output()/parse_output_line() pair.
        for key_value in mr_job.parse_output(runner.cat_output()):
            print(key_value)
Editor is loading...