# Public: Trains the predictor on books in our dataset. This method is called
# before the predict() method is called.
#
# For every category, counts how often each word of at least seven characters
# occurs across that category's books only, then stores the most frequent
# words (at most 2000, ordered from least to most frequent) in
# @data[category]. Categories without any books get no entry.
#
# Returns nothing.
def train!
  min_word_length = 7
  vocabulary_size = 2000

  @subjects = []
  @points = Hash.new(0)
  @data = {}
  @words = Hash.new(0) # corpus-wide counts, kept alongside the per-category ones

  @all_books.each do |category, books|
    next if books.empty?

    # A fresh counter per category keeps one subject's vocabulary from leaking
    # into the next one.
    category_counts = Hash.new(0)

    books.each do |filename, words|
      words.each do |word|
        next if word.length < min_word_length

        category_counts[word] += 1
        @words[word] += 1
      end

      # Third path segment of the file name; presumably the subject directory
      # (verify against the dataset layout). Stored as a plain string so it can
      # be compared with tokens directly.
      @subjects << filename.split('/')[2]
    end

    @points[category] = 0

    # Rank once per category. Array#last(n) returns fewer than n elements for
    # a small vocabulary instead of nil, so short corpora do not crash.
    ranked = category_counts.sort_by { |_word, count| count }
    @data[category] = ranked.last(vocabulary_size).map(&:first)
  end
end
# Public: Predicts category.
#
# Takes the (up to) 100 most frequent words of at least seven characters in
# the text and picks the trained category whose vocabulary in @data shares
# the most words with them. When every vocabulary has the same size this is
# the same as picking the category with the fewest unmatched words, but it
# does not favour categories that simply have a smaller vocabulary.
#
# tokens - A list of tokens (words).
#
# Returns a category, or nil when no category has been trained. Ties go to
# the category that comes first in @data.
def predict(tokens)
  min_word_length = 7
  top_word_count = 100

  frequencies = Hash.new(0)
  tokens.each do |token|
    frequencies[token] += 1 if token.length >= min_word_length
  end

  # Array#last(n) copes with texts that contain fewer than n distinct words.
  top_words = frequencies.sort_by { |_word, count| count }
                         .last(top_word_count)
                         .map(&:first)

  best_category = nil
  best_overlap = -1

  @data.each do |category, vocabulary|
    overlap = (vocabulary & top_words).size
    # Strict comparison: an equally good later category never replaces an
    # earlier one.
    next unless overlap > best_overlap

    best_category = category
    best_overlap = overlap
  end

  best_category
end
# Scratch notes from prototyping the predictor.
#
# Counting occurrences: a Hash.new(0) incremented once per item gives a
# frequency table, and sort_by { |_k, v| v } orders it by count.
#
# Array difference: for a = [1, 2, 3, 4, 5, 6, 7, 8] and
# b = [5, 6, 7, 8, 9, 10, 11], a - b keeps the elements of a that are missing
# from b. The shorter that result is, the more items the two lists have in
# common, so the smallest difference marks the closest match.

# Returns the key whose value is largest, or nil for an empty hash.
def key_with_largest_value(counts)
  best = counts.max_by { |_key, value| value }
  best && best.first
end

stuff = { nick: 3, robert: 1, mike: 7 }
p key_with_largest_value(stuff)