Skip to content

Commit ecc4507

Browse files
authored
Merge pull request #196 from aparibocci/max_k_most_frequent_words
Solving the `Top k most frequent words` problem using a max-heap
2 parents cb55645 + 1846c76 commit ecc4507

2 files changed

Lines changed: 64 additions & 0 deletions

File tree

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
require_relative '../data_structures/heaps/max_heap'
2+
3+
##
# This class represents a word count information
# (i.e. how many occurrences for a word).
#
# Instances are ordered by +occurrences+ only (through Comparable), so two
# instances holding different words but the same count compare as equal.

class WordCount
  include Comparable

  attr_reader :word, :occurrences

  def initialize(word, occurrences)
    @word = word
    @occurrences = occurrences
  end

  # Compares two word counts by their number of occurrences.
  #
  # Returns +nil+ when +other+ does not expose +occurrences+, so that
  # Comparable#== answers +false+ for unrelated objects (e.g. +nil+) instead
  # of raising NoMethodError.
  def <=>(other)
    return nil unless other.respond_to?(:occurrences)

    occurrences <=> other.occurrences
  end
end
22+
23+
##
# Returns the `k` most frequently occurring words, in non-increasing order of occurrence.
# In this context, a word is defined as an element in the provided list.
#
# In case `k` is greater than the number of distinct words, every distinct
# word is returned. In case `k` is zero or negative, an empty list is returned.
#
# The relative order of words sharing the same number of occurrences depends
# on the order in which the heap yields equal elements.

def max_k_most_frequent_words(words, k)
  count_by_word = words.tally
  heap = MaxHeap.new(count_by_word.map { |word, occurrences| WordCount.new(word, occurrences) })

  # Never extract more entries than the heap holds, and never a negative amount.
  result_size = k.clamp(0, count_by_word.size)
  Array.new(result_size) { heap.extract_max.word }
end
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
require 'minitest/autorun'
2+
require_relative 'max_k_most_frequent_words'
3+
4+
##
# Tests for max_k_most_frequent_words.
#
# Uses assert_equal rather than `assert a == b` so that a failure reports the
# expected and actual lists instead of a bare "Expected false to be truthy".

class TestMaxKMostFrequentWords < Minitest::Test
  # Shared fixture: 'c' occurs three times, 'a' twice and 'b' once.
  WORDS = %w[a b c a c c].freeze

  def test_top_3_frequent_words
    assert_equal %w[c a b], max_k_most_frequent_words(WORDS, 3)
  end

  def test_top_2_frequent_words
    assert_equal %w[c a], max_k_most_frequent_words(WORDS, 2)
  end

  def test_top_frequent_word
    assert_equal %w[c], max_k_most_frequent_words(WORDS, 1)
  end

  def test_no_frequent_word_given_zero_k
    assert_equal [], max_k_most_frequent_words(WORDS, 0)
  end

  def test_no_frequent_word_given_empty_word_list
    assert_equal [], max_k_most_frequent_words([], 1)
  end

  def test_all_frequent_words_given_k_too_large
    assert_equal %w[a], max_k_most_frequent_words(%w[a a], 2)
  end
end

0 commit comments

Comments
 (0)