diff --git a/gutenberg/Gemfile b/gutenberg/Gemfile new file mode 100644 index 0000000..e688475 --- /dev/null +++ b/gutenberg/Gemfile @@ -0,0 +1,3 @@ +source 'https://rubygems.org' + +gem 'pry-byebug' \ No newline at end of file diff --git a/README.md b/gutenberg/README.md similarity index 100% rename from README.md rename to gutenberg/README.md diff --git a/data/sample/astronomy/test.txt b/gutenberg/data/sample/astronomy/test.txt similarity index 100% rename from data/sample/astronomy/test.txt rename to gutenberg/data/sample/astronomy/test.txt diff --git a/data/sample/philosophy/test.txt b/gutenberg/data/sample/philosophy/test.txt similarity index 100% rename from data/sample/philosophy/test.txt rename to gutenberg/data/sample/philosophy/test.txt diff --git a/data/sample/physics/test.txt b/gutenberg/data/sample/physics/test.txt similarity index 100% rename from data/sample/physics/test.txt rename to gutenberg/data/sample/physics/test.txt diff --git a/data/sample/religion/test.txt b/gutenberg/data/sample/religion/test.txt similarity index 100% rename from data/sample/religion/test.txt rename to gutenberg/data/sample/religion/test.txt diff --git a/data/stopwords.txt b/gutenberg/data/stopwords.txt similarity index 94% rename from data/stopwords.txt rename to gutenberg/data/stopwords.txt index 7336c7c..effee63 100644 --- a/data/stopwords.txt +++ b/gutenberg/data/stopwords.txt @@ -1 +1 @@ -a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your,one,out,more,now,first,two,very,such,same,shall,upon,before,therefore,great,made,even,same,work,make,being,through,here,way,true,see,time,those,place,much,without,body,whole,another,thus,set,new,given,both,above,well,part,between,end,order,each,form,gutenberg +a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your,one,out,more,now,first,two,very,such,same,shall,upon,before,therefore,great,made,even,same,work,make,being,through,here,way,true,see,time,those,place,much,without,body,whole,another,thus,set,new,given,both,above,well,part,between,end,order,each,form,gutenberg, diff --git a/data/test/archeology/22153-0.txt b/gutenberg/data/test/archeology/22153-0.txt similarity index 100% rename from data/test/archeology/22153-0.txt rename to gutenberg/data/test/archeology/22153-0.txt diff --git a/data/test/archeology/pg13575.txt b/gutenberg/data/test/archeology/pg13575.txt similarity index 100% rename from data/test/archeology/pg13575.txt rename to gutenberg/data/test/archeology/pg13575.txt diff --git a/data/test/archeology/pg17606.txt b/gutenberg/data/test/archeology/pg17606.txt similarity index 100% rename from data/test/archeology/pg17606.txt rename to gutenberg/data/test/archeology/pg17606.txt diff --git a/data/test/archeology/pg17987.txt b/gutenberg/data/test/archeology/pg17987.txt similarity index 100% rename from data/test/archeology/pg17987.txt rename to gutenberg/data/test/archeology/pg17987.txt diff --git a/data/test/archeology/pg18931.txt b/gutenberg/data/test/archeology/pg18931.txt similarity index 100% rename from data/test/archeology/pg18931.txt rename to gutenberg/data/test/archeology/pg18931.txt diff --git a/data/test/archeology/pg19921.txt b/gutenberg/data/test/archeology/pg19921.txt similarity index 100% rename from data/test/archeology/pg19921.txt rename to gutenberg/data/test/archeology/pg19921.txt diff --git a/data/test/archeology/pg23691.txt b/gutenberg/data/test/archeology/pg23691.txt similarity index 100% rename from data/test/archeology/pg23691.txt rename to gutenberg/data/test/archeology/pg23691.txt diff --git a/data/test/archeology/pg4248.txt b/gutenberg/data/test/archeology/pg4248.txt similarity index 100% rename from data/test/archeology/pg4248.txt rename to gutenberg/data/test/archeology/pg4248.txt diff --git a/data/test/astronomy/22157-0.txt b/gutenberg/data/test/astronomy/22157-0.txt similarity index 100% rename from data/test/astronomy/22157-0.txt rename to gutenberg/data/test/astronomy/22157-0.txt diff --git a/data/test/astronomy/8hsrs10u.txt b/gutenberg/data/test/astronomy/8hsrs10u.txt similarity index 100% rename from data/test/astronomy/8hsrs10u.txt rename to gutenberg/data/test/astronomy/8hsrs10u.txt diff --git a/data/test/astronomy/pg16767.txt b/gutenberg/data/test/astronomy/pg16767.txt similarity index 100% rename from data/test/astronomy/pg16767.txt rename to gutenberg/data/test/astronomy/pg16767.txt diff --git a/data/test/astronomy/pg25267.txt b/gutenberg/data/test/astronomy/pg25267.txt similarity index 100% rename from data/test/astronomy/pg25267.txt rename to gutenberg/data/test/astronomy/pg25267.txt diff --git a/data/test/astronomy/pg27477.txt b/gutenberg/data/test/astronomy/pg27477.txt similarity index 100% rename from data/test/astronomy/pg27477.txt rename to gutenberg/data/test/astronomy/pg27477.txt diff --git a/data/test/astronomy/pg28570.txt b/gutenberg/data/test/astronomy/pg28570.txt similarity index 100% rename from data/test/astronomy/pg28570.txt rename to gutenberg/data/test/astronomy/pg28570.txt diff --git a/data/test/astronomy/pg4065.txt b/gutenberg/data/test/astronomy/pg4065.txt similarity index 100% rename from data/test/astronomy/pg4065.txt rename to gutenberg/data/test/astronomy/pg4065.txt diff --git a/data/test/philosophy/11100-0.txt b/gutenberg/data/test/philosophy/11100-0.txt similarity index 100% rename from data/test/philosophy/11100-0.txt rename to gutenberg/data/test/philosophy/11100-0.txt diff --git a/data/test/philosophy/pg13316.txt b/gutenberg/data/test/philosophy/pg13316.txt similarity index 100% rename from data/test/philosophy/pg13316.txt rename to gutenberg/data/test/philosophy/pg13316.txt diff --git a/data/test/philosophy/pg1497.txt b/gutenberg/data/test/philosophy/pg1497.txt similarity index 100% rename from data/test/philosophy/pg1497.txt rename to gutenberg/data/test/philosophy/pg1497.txt diff --git a/data/test/philosophy/pg16712.txt b/gutenberg/data/test/philosophy/pg16712.txt similarity index 100% rename from data/test/philosophy/pg16712.txt rename to gutenberg/data/test/philosophy/pg16712.txt diff --git a/data/test/philosophy/pg22283.txt b/gutenberg/data/test/philosophy/pg22283.txt similarity index 100% rename from data/test/philosophy/pg22283.txt rename to gutenberg/data/test/philosophy/pg22283.txt diff --git a/data/test/philosophy/pg5827.txt b/gutenberg/data/test/philosophy/pg5827.txt similarity index 100% rename from data/test/philosophy/pg5827.txt rename to gutenberg/data/test/philosophy/pg5827.txt diff --git a/data/test/religion/pg15836.txt b/gutenberg/data/test/religion/pg15836.txt similarity index 100% rename from data/test/religion/pg15836.txt rename to gutenberg/data/test/religion/pg15836.txt diff --git a/data/test/religion/pg21190.txt b/gutenberg/data/test/religion/pg21190.txt similarity index 100% rename from data/test/religion/pg21190.txt rename to gutenberg/data/test/religion/pg21190.txt diff --git a/data/test/religion/pg7883.txt b/gutenberg/data/test/religion/pg7883.txt similarity index 100% rename from data/test/religion/pg7883.txt rename to gutenberg/data/test/religion/pg7883.txt diff --git a/data/test/religion/pg8070.txt b/gutenberg/data/test/religion/pg8070.txt similarity index 100% rename from data/test/religion/pg8070.txt rename to gutenberg/data/test/religion/pg8070.txt diff --git a/data/test/religion/pg8200.txt b/gutenberg/data/test/religion/pg8200.txt similarity index 100% rename from data/test/religion/pg8200.txt rename to gutenberg/data/test/religion/pg8200.txt diff --git a/data/training/archeology/17170-0.txt b/gutenberg/data/training/archeology/17170-0.txt similarity index 100% rename from data/training/archeology/17170-0.txt rename to gutenberg/data/training/archeology/17170-0.txt diff --git a/data/training/archeology/18206-0.txt b/gutenberg/data/training/archeology/18206-0.txt similarity index 100% rename from data/training/archeology/18206-0.txt rename to gutenberg/data/training/archeology/18206-0.txt diff --git a/data/training/archeology/19606-0.txt b/gutenberg/data/training/archeology/19606-0.txt similarity index 100% rename from data/training/archeology/19606-0.txt rename to gutenberg/data/training/archeology/19606-0.txt diff --git a/data/training/archeology/19856-0.txt b/gutenberg/data/training/archeology/19856-0.txt similarity index 100% rename from data/training/archeology/19856-0.txt rename to gutenberg/data/training/archeology/19856-0.txt diff --git a/data/training/archeology/19913-0.txt b/gutenberg/data/training/archeology/19913-0.txt similarity index 100% rename from data/training/archeology/19913-0.txt rename to gutenberg/data/training/archeology/19913-0.txt diff --git a/data/training/archeology/20153-0.txt b/gutenberg/data/training/archeology/20153-0.txt similarity index 100% rename from data/training/archeology/20153-0.txt rename to gutenberg/data/training/archeology/20153-0.txt diff --git a/data/training/archeology/pg14400.txt b/gutenberg/data/training/archeology/pg14400.txt similarity index 100% rename from data/training/archeology/pg14400.txt rename to gutenberg/data/training/archeology/pg14400.txt diff --git a/data/training/archeology/pg17321.txt b/gutenberg/data/training/archeology/pg17321.txt similarity index 100% rename from data/training/archeology/pg17321.txt rename to gutenberg/data/training/archeology/pg17321.txt diff --git a/data/training/archeology/pg18184.txt b/gutenberg/data/training/archeology/pg18184.txt similarity index 100% rename from data/training/archeology/pg18184.txt rename to gutenberg/data/training/archeology/pg18184.txt diff --git a/data/training/archeology/pg19115.txt b/gutenberg/data/training/archeology/pg19115.txt similarity index 100% rename from data/training/archeology/pg19115.txt rename to gutenberg/data/training/archeology/pg19115.txt diff --git a/data/training/archeology/pg19723.txt b/gutenberg/data/training/archeology/pg19723.txt similarity index 100% rename from data/training/archeology/pg19723.txt rename to gutenberg/data/training/archeology/pg19723.txt diff --git a/data/training/archeology/pg19953.txt b/gutenberg/data/training/archeology/pg19953.txt similarity index 100% rename from data/training/archeology/pg19953.txt rename to gutenberg/data/training/archeology/pg19953.txt diff --git a/data/training/archeology/pg24505.txt b/gutenberg/data/training/archeology/pg24505.txt similarity index 100% rename from data/training/archeology/pg24505.txt rename to gutenberg/data/training/archeology/pg24505.txt diff --git a/data/training/archeology/pg24654.txt b/gutenberg/data/training/archeology/pg24654.txt similarity index 100% rename from data/training/archeology/pg24654.txt rename to gutenberg/data/training/archeology/pg24654.txt diff --git a/data/training/archeology/pg6462.txt b/gutenberg/data/training/archeology/pg6462.txt similarity index 100% rename from data/training/archeology/pg6462.txt rename to gutenberg/data/training/archeology/pg6462.txt diff --git a/data/training/astronomy/19103-0.txt b/gutenberg/data/training/astronomy/19103-0.txt similarity index 100% rename from data/training/astronomy/19103-0.txt rename to gutenberg/data/training/astronomy/19103-0.txt diff --git a/data/training/astronomy/20769-0.txt b/gutenberg/data/training/astronomy/20769-0.txt similarity index 100% rename from data/training/astronomy/20769-0.txt rename to gutenberg/data/training/astronomy/20769-0.txt diff --git a/data/training/astronomy/28536-0.txt b/gutenberg/data/training/astronomy/28536-0.txt similarity index 100% rename from data/training/astronomy/28536-0.txt rename to gutenberg/data/training/astronomy/28536-0.txt diff --git a/data/training/astronomy/pg15620.txt b/gutenberg/data/training/astronomy/pg15620.txt similarity index 100% rename from data/training/astronomy/pg15620.txt rename to gutenberg/data/training/astronomy/pg15620.txt diff --git a/data/training/astronomy/pg15636.txt b/gutenberg/data/training/astronomy/pg15636.txt similarity index 100% rename from data/training/astronomy/pg15636.txt rename to gutenberg/data/training/astronomy/pg15636.txt diff --git a/data/training/astronomy/pg18431.txt b/gutenberg/data/training/astronomy/pg18431.txt similarity index 100% rename from data/training/astronomy/pg18431.txt rename to gutenberg/data/training/astronomy/pg18431.txt diff --git a/data/training/astronomy/pg25992.txt b/gutenberg/data/training/astronomy/pg25992.txt similarity index 100% rename from data/training/astronomy/pg25992.txt rename to gutenberg/data/training/astronomy/pg25992.txt diff --git a/data/training/astronomy/pg26556.txt b/gutenberg/data/training/astronomy/pg26556.txt similarity index 100% rename from data/training/astronomy/pg26556.txt rename to gutenberg/data/training/astronomy/pg26556.txt diff --git a/data/training/astronomy/pg28247.txt b/gutenberg/data/training/astronomy/pg28247.txt similarity index 100% rename from data/training/astronomy/pg28247.txt rename to gutenberg/data/training/astronomy/pg28247.txt diff --git a/data/training/astronomy/pg28434.txt b/gutenberg/data/training/astronomy/pg28434.txt similarity index 100% rename from data/training/astronomy/pg28434.txt rename to gutenberg/data/training/astronomy/pg28434.txt diff --git a/data/training/astronomy/pg28613.txt b/gutenberg/data/training/astronomy/pg28613.txt similarity index 100% rename from data/training/astronomy/pg28613.txt rename to gutenberg/data/training/astronomy/pg28613.txt diff --git a/data/training/astronomy/pg28752.txt b/gutenberg/data/training/astronomy/pg28752.txt similarity index 100% rename from data/training/astronomy/pg28752.txt rename to gutenberg/data/training/astronomy/pg28752.txt diff --git a/data/training/astronomy/pg35744.txt b/gutenberg/data/training/astronomy/pg35744.txt similarity index 100% rename from data/training/astronomy/pg35744.txt rename to gutenberg/data/training/astronomy/pg35744.txt diff --git a/data/training/astronomy/pg35937.txt b/gutenberg/data/training/astronomy/pg35937.txt similarity index 100% rename from data/training/astronomy/pg35937.txt rename to gutenberg/data/training/astronomy/pg35937.txt diff --git a/data/training/astronomy/pg6630.txt b/gutenberg/data/training/astronomy/pg6630.txt similarity index 100% rename from data/training/astronomy/pg6630.txt rename to gutenberg/data/training/astronomy/pg6630.txt diff --git a/data/training/philosophy/25110-0.txt b/gutenberg/data/training/philosophy/25110-0.txt similarity index 100% rename from data/training/philosophy/25110-0.txt rename to gutenberg/data/training/philosophy/25110-0.txt diff --git a/data/training/philosophy/25447-0.txt b/gutenberg/data/training/philosophy/25447-0.txt similarity index 100% rename from data/training/philosophy/25447-0.txt rename to gutenberg/data/training/philosophy/25447-0.txt diff --git a/data/training/philosophy/35722-0.txt b/gutenberg/data/training/philosophy/35722-0.txt similarity index 100% rename from data/training/philosophy/35722-0.txt rename to gutenberg/data/training/philosophy/35722-0.txt diff --git a/data/training/philosophy/pg10214.txt b/gutenberg/data/training/philosophy/pg10214.txt similarity index 100% rename from data/training/philosophy/pg10214.txt rename to gutenberg/data/training/philosophy/pg10214.txt diff --git a/data/training/philosophy/pg10378.txt b/gutenberg/data/training/philosophy/pg10378.txt similarity index 100% rename from data/training/philosophy/pg10378.txt rename to gutenberg/data/training/philosophy/pg10378.txt diff --git a/data/training/philosophy/pg14357.txt b/gutenberg/data/training/philosophy/pg14357.txt similarity index 100% rename from data/training/philosophy/pg14357.txt rename to gutenberg/data/training/philosophy/pg14357.txt diff --git a/data/training/philosophy/pg16406.txt b/gutenberg/data/training/philosophy/pg16406.txt similarity index 100% rename from data/training/philosophy/pg16406.txt rename to gutenberg/data/training/philosophy/pg16406.txt diff --git a/data/training/philosophy/pg17556.txt b/gutenberg/data/training/philosophy/pg17556.txt similarity index 100% rename from data/training/philosophy/pg17556.txt rename to gutenberg/data/training/philosophy/pg17556.txt diff --git a/data/training/philosophy/pg1998.txt b/gutenberg/data/training/philosophy/pg1998.txt similarity index 100% rename from data/training/philosophy/pg1998.txt rename to gutenberg/data/training/philosophy/pg1998.txt diff --git a/data/training/philosophy/pg20500.txt b/gutenberg/data/training/philosophy/pg20500.txt similarity index 100% rename from data/training/philosophy/pg20500.txt rename to gutenberg/data/training/philosophy/pg20500.txt diff --git a/data/training/philosophy/pg21565.txt b/gutenberg/data/training/philosophy/pg21565.txt similarity index 100% rename from data/training/philosophy/pg21565.txt rename to gutenberg/data/training/philosophy/pg21565.txt diff --git a/data/training/philosophy/pg2412.txt b/gutenberg/data/training/philosophy/pg2412.txt similarity index 100% rename from data/training/philosophy/pg2412.txt rename to gutenberg/data/training/philosophy/pg2412.txt diff --git a/data/training/philosophy/pg29530.txt b/gutenberg/data/training/philosophy/pg29530.txt similarity index 100% rename from data/training/philosophy/pg29530.txt rename to gutenberg/data/training/philosophy/pg29530.txt diff --git a/data/training/philosophy/pg4754.txt b/gutenberg/data/training/philosophy/pg4754.txt similarity index 100% rename from data/training/philosophy/pg4754.txt rename to gutenberg/data/training/philosophy/pg4754.txt diff --git a/data/training/philosophy/pg5637.txt b/gutenberg/data/training/philosophy/pg5637.txt similarity index 100% rename from data/training/philosophy/pg5637.txt rename to gutenberg/data/training/philosophy/pg5637.txt diff --git a/data/training/philosophy/pg7370.txt b/gutenberg/data/training/philosophy/pg7370.txt similarity index 100% rename from data/training/philosophy/pg7370.txt rename to gutenberg/data/training/philosophy/pg7370.txt diff --git a/data/training/philosophy/pg7514.txt b/gutenberg/data/training/philosophy/pg7514.txt similarity index 100% rename from data/training/philosophy/pg7514.txt rename to gutenberg/data/training/philosophy/pg7514.txt diff --git a/data/training/religion/26881-0.txt b/gutenberg/data/training/religion/26881-0.txt similarity index 100% rename from data/training/religion/26881-0.txt rename to gutenberg/data/training/religion/26881-0.txt diff --git a/data/training/religion/pg13601.txt b/gutenberg/data/training/religion/pg13601.txt similarity index 100% rename from data/training/religion/pg13601.txt rename to gutenberg/data/training/religion/pg13601.txt diff --git a/data/training/religion/pg15185.txt b/gutenberg/data/training/religion/pg15185.txt similarity index 100% rename from data/training/religion/pg15185.txt rename to gutenberg/data/training/religion/pg15185.txt diff --git a/data/training/religion/pg1549.txt b/gutenberg/data/training/religion/pg1549.txt similarity index 100% rename from data/training/religion/pg1549.txt rename to gutenberg/data/training/religion/pg1549.txt diff --git a/data/training/religion/pg19950.txt b/gutenberg/data/training/religion/pg19950.txt similarity index 100% rename from data/training/religion/pg19950.txt rename to gutenberg/data/training/religion/pg19950.txt diff --git a/data/training/religion/pg21872.txt b/gutenberg/data/training/religion/pg21872.txt similarity index 100% rename from data/training/religion/pg21872.txt rename to gutenberg/data/training/religion/pg21872.txt diff --git a/data/training/religion/pg30.txt b/gutenberg/data/training/religion/pg30.txt similarity index 100% rename from data/training/religion/pg30.txt rename to gutenberg/data/training/religion/pg30.txt diff --git a/data/training/religion/pg7069.txt b/gutenberg/data/training/religion/pg7069.txt similarity index 100% rename from data/training/religion/pg7069.txt rename to gutenberg/data/training/religion/pg7069.txt diff --git a/data/training/religion/pg7351.txt b/gutenberg/data/training/religion/pg7351.txt similarity index 100% rename from data/training/religion/pg7351.txt rename to gutenberg/data/training/religion/pg7351.txt diff --git a/data/training/religion/pg7925.txt b/gutenberg/data/training/religion/pg7925.txt similarity index 100% rename from data/training/religion/pg7925.txt rename to gutenberg/data/training/religion/pg7925.txt diff --git a/data/training/religion/pg8068.txt b/gutenberg/data/training/religion/pg8068.txt similarity index 100% rename from data/training/religion/pg8068.txt rename to gutenberg/data/training/religion/pg8068.txt diff --git a/data/training/religion/pg8069.txt b/gutenberg/data/training/religion/pg8069.txt similarity index 100% rename from data/training/religion/pg8069.txt rename to gutenberg/data/training/religion/pg8069.txt diff --git a/data/training/religion/pg8071.txt b/gutenberg/data/training/religion/pg8071.txt similarity index 100% rename from data/training/religion/pg8071.txt rename to gutenberg/data/training/religion/pg8071.txt diff --git a/data/training/religion/pg8381.txt b/gutenberg/data/training/religion/pg8381.txt similarity index 100% rename from data/training/religion/pg8381.txt rename to gutenberg/data/training/religion/pg8381.txt diff --git a/data/training/religion/pg8397.txt b/gutenberg/data/training/religion/pg8397.txt similarity index 100% rename from data/training/religion/pg8397.txt rename to gutenberg/data/training/religion/pg8397.txt diff --git a/gutenberg.rb b/gutenberg/gutenberg.rb similarity index 97% rename from gutenberg.rb rename to gutenberg/gutenberg.rb index 84d20f6..8163c59 100644 --- a/gutenberg.rb +++ b/gutenberg/gutenberg.rb @@ -1,5 +1,6 @@ require_relative 'lib/simple_predictor' require_relative 'lib/complex_predictor' +require 'pry-byebug' def run!(predictor_klass, opts={}) puts "+----------------------------------------------------+" @@ -15,7 +16,6 @@ def run!(predictor_klass, opts={}) start_time = Time.now predictor.train! puts "Training took #{Time.now - start_time} seconds." - puts "Predicting..." start_time = Time.now accuracy = predictor.predict_test_set(opts) diff --git a/gutenberg/lib/complex_predictor.rb b/gutenberg/lib/complex_predictor.rb new file mode 100644 index 0000000..934a08a --- /dev/null +++ b/gutenberg/lib/complex_predictor.rb @@ -0,0 +1,80 @@ +require_relative 'predictor' + +class ComplexPredictor < Predictor + # Public: Trains the predictor on books in our dataset. This method is called + # before the predict() method is called. + # + # Returns nothing. + def train! + @data = Hash.new() + + get_categories(@data, @all_books) + compile_words_by_category(@data, @all_books) + build_top_words_hash_for_each_category(@data, 50) + end + + def build_top_words_hash_for_each_category(compiled_words_by_category_hash, number_of_pop_words) + @data.each do |category, books| + histogram = build_histogram(books) + top_words = find_top_words(histogram, number_of_pop_words) + @data[category] = top_words + end + end + + def get_categories(empty_hash, given_books) + given_books.keys.each {|key| empty_hash[key] = []} + end + + def compile_words_by_category(categories_hash, given_books) + given_books.each do |category, books| + books.each do |book| + book[1].each do |word| + categories_hash[category] << word + end + end + end + end + + def build_histogram(array) + histogram = Hash.new(0) + + array.each do |word| + if good_token?(word) + histogram[word] += 1 + end + end + + histogram + end + + def find_top_words(histogram, number_of_words) + sorted = histogram.sort_by {|k, v| v} + Hash[sorted[-number_of_words..-1]] + end + + + # Public: Predicts category. + # + # tokens - A list of tokens (words). + # + # Returns a category. + def predict(tokens) + book_histogram = build_histogram(tokens) + top_words_for_book = find_top_words(book_histogram, 300) + decide_category(top_words_for_book, @data) + end + + def decide_category(top_words, training_data) + count_hash = Hash.new(0) + training_data.each do |category, pop_words| + top_words.each_key do |word| + if pop_words.has_key?(word) + count_hash[category] = count_hash[category] + 1 + end + end + end + count_hash.max_by{|k,v| v}.first + end +end + + diff --git a/lib/predictor.rb b/gutenberg/lib/predictor.rb similarity index 100% rename from lib/predictor.rb rename to gutenberg/lib/predictor.rb diff --git a/lib/simple_predictor.rb b/gutenberg/lib/simple_predictor.rb similarity index 100% rename from lib/simple_predictor.rb rename to gutenberg/lib/simple_predictor.rb diff --git a/utils/stopword_finder.rb b/gutenberg/utils/stopword_finder.rb similarity index 100% rename from utils/stopword_finder.rb rename to gutenberg/utils/stopword_finder.rb diff --git a/lib/complex_predictor.rb b/lib/complex_predictor.rb deleted file mode 100644 index b8921f3..0000000 --- a/lib/complex_predictor.rb +++ /dev/null @@ -1,22 +0,0 @@ -require_relative 'predictor' - -class ComplexPredictor < Predictor - # Public: Trains the predictor on books in our dataset. This method is called - # before the predict() method is called. - # - # Returns nothing. - def train! - @data = {} - end - - # Public: Predicts category. - # - # tokens - A list of tokens (words). - # - # Returns a category. - def predict(tokens) - # Always predict astronomy, for now. - :astronomy - end -end -