From ffcfa78398fd13c0fdba77c6c2441b60ff89c869 Mon Sep 17 00:00:00 2001 From: Gregory Ostermayr Date: Fri, 24 Jul 2020 22:49:53 -0400 Subject: [PATCH 01/30] remove dead check bug fix --- lib/boilerpipe/filters/block_proximity_fusion.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/boilerpipe/filters/block_proximity_fusion.rb b/lib/boilerpipe/filters/block_proximity_fusion.rb index 8ec40f0..9013cb0 100644 --- a/lib/boilerpipe/filters/block_proximity_fusion.rb +++ b/lib/boilerpipe/filters/block_proximity_fusion.rb @@ -40,7 +40,7 @@ def process(doc) diff_blocks = tb.offset_blocks_start - prev_block.offset_blocks_end - 1 if diff_blocks <= @max_blocks_distance ok = true - ok = false if (prev_block.is_not_content? || tb.is_not_content?) && @content_only + ok = false if prev_block.is_not_content? && @content_only ok = false if ok && prev_block.tag_level != tb.tag_level && @same_tag_level_only if ok From bf4d41b4604f11ba244cf3cac2b3ecc82b79fdaf Mon Sep 17 00:00:00 2001 From: Gregory Ostermayr Date: Tue, 28 Jul 2020 01:54:32 -0400 Subject: [PATCH 02/30] TextBlock#merge adds num_words --- spec/document/text_block_spec.rb | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/spec/document/text_block_spec.rb b/spec/document/text_block_spec.rb index bbc3d34..ebe41bf 100644 --- a/spec/document/text_block_spec.rb +++ b/spec/document/text_block_spec.rb @@ -50,6 +50,12 @@ module Boilerpipe subject.merge_next(another_block) expect(subject.text).to eq "hello\ngood-bye" end + + it 'num words gets combined' do + another_block = Document::TextBlock.new('good-bye', 1) + subject.merge_next(another_block) + expect(subject.num_words).to eq 1 + end end describe '#add_label' do From 49547b090778d364e39b01e2579f97a09cc8fb56 Mon Sep 17 00:00:00 2001 From: Gregory Ostermayr Date: Tue, 28 Jul 2020 11:21:43 -0400 Subject: [PATCH 03/30] num_words_text_anchor spec --- spec/document/text_block_spec.rb | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/spec/document/text_block_spec.rb b/spec/document/text_block_spec.rb index ebe41bf..d2bd81f 100644 --- a/spec/document/text_block_spec.rb +++ b/spec/document/text_block_spec.rb @@ -56,6 +56,12 @@ module Boilerpipe subject.merge_next(another_block) expect(subject.num_words).to eq 1 end + + it 'num_words_in_anchor_text gets combined' do + another_block = Document::TextBlock.new('good-bye', 1, 1) + subject.merge_next(another_block) + expect(subject.num_words_in_anchor_text).to eq 1 + end end describe '#add_label' do From f8de958b64ca7daa406520845f511d50e44979a8 Mon Sep 17 00:00:00 2001 From: Gregory Ostermayr Date: Tue, 28 Jul 2020 11:38:38 -0400 Subject: [PATCH 04/30] spec for num_wrapped lines --- spec/document/text_block_spec.rb | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/spec/document/text_block_spec.rb b/spec/document/text_block_spec.rb index d2bd81f..297f0f0 100644 --- a/spec/document/text_block_spec.rb +++ b/spec/document/text_block_spec.rb @@ -62,6 +62,19 @@ module Boilerpipe subject.merge_next(another_block) expect(subject.num_words_in_anchor_text).to eq 1 end + + it 'num_words_in_wrapped_lines gets combined' do + another_block = Document::TextBlock.new('good-bye', 1, 1, 1) + subject.merge_next(another_block) + expect(subject.num_words_in_wrapped_lines).to eq 1 + end + + it 'num_wrapped_lines gets combined' do + # one by default + another_block = Document::TextBlock.new('good-bye', 1, 1, 1) + subject.merge_next(another_block) + expect(subject.num_wrapped_lines).to eq 2 + end end describe '#add_label' do From 666fae5aeb6185dddf0c10151efbd5c92b0b7941 Mon Sep 17 00:00:00 2001 From: Gregory Ostermayr Date: Tue, 28 Jul 2020 11:45:15 -0400 Subject: [PATCH 05/30] remove num_full_text_words not used by anything --- lib/boilerpipe/document/text_block.rb | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/lib/boilerpipe/document/text_block.rb b/lib/boilerpipe/document/text_block.rb index 3f0bc50..4c33110 100644 --- a/lib/boilerpipe/document/text_block.rb +++ b/lib/boilerpipe/document/text_block.rb @@ -5,7 +5,7 @@ class TextBlock attr_reader :text, :num_words, :num_words_in_wrapped_lines, :num_words_in_anchor_text, :num_wrapped_lines, :offset_blocks_start, :offset_blocks_end, :text_density, - :link_density, :labels, :tag_level, :num_full_text_words + :link_density, :labels, :tag_level attr_accessor :content @@ -16,7 +16,6 @@ def initialize(text, num_words = 0, num_words_in_anchor_text = 0, num_words_in_w @num_words_in_anchor_text = num_words_in_anchor_text @num_words_in_wrapped_lines = num_words_in_wrapped_lines @num_wrapped_lines = num_wrapped_lines - @num_full_text_words = 0 @offset_blocks_start = offset_blocks @offset_blocks_end = offset_blocks @content = false @@ -70,8 +69,6 @@ def merge_next(other) init_densities @content |= other.is_content? - @num_full_text_words += other.num_full_text_words - if other.labels if @labels.nil? @labels = other.labels.clone From 3d78b74ccd61a1503e5eacba3b9c41bf2ab686ed Mon Sep 17 00:00:00 2001 From: Gregory Ostermayr Date: Tue, 28 Jul 2020 11:52:01 -0400 Subject: [PATCH 06/30] spec for #merge start takes min starting block index --- spec/document/text_block_spec.rb | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/spec/document/text_block_spec.rb b/spec/document/text_block_spec.rb index 297f0f0..075de00 100644 --- a/spec/document/text_block_spec.rb +++ b/spec/document/text_block_spec.rb @@ -75,6 +75,13 @@ module Boilerpipe subject.merge_next(another_block) expect(subject.num_wrapped_lines).to eq 2 end + + it 'offset_block_start uses the earlier start' do + block = Document::TextBlock.new('one', 1, 1, 1, 1, 5) + another_block = Document::TextBlock.new('two', 1, 1, 1, 1, 3) + block.merge_next(another_block) + expect(block.offset_blocks_start).to eq 3 + end end describe '#add_label' do From 1234c4260eab2cc5b488901fcee087c9d6c4b561 Mon Sep 17 00:00:00 2001 From: Gregory Ostermayr Date: Tue, 28 Jul 2020 11:58:28 -0400 Subject: [PATCH 07/30] spec for merge - use the later end block --- spec/document/text_block_spec.rb | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/spec/document/text_block_spec.rb b/spec/document/text_block_spec.rb index 075de00..a0c3527 100644 --- a/spec/document/text_block_spec.rb +++ b/spec/document/text_block_spec.rb @@ -82,6 +82,13 @@ module Boilerpipe block.merge_next(another_block) expect(block.offset_blocks_start).to eq 3 end + + it 'offset_block_end uses the later end' do + block = Document::TextBlock.new('one', 1, 1, 1, 1, 5) + another_block = Document::TextBlock.new('two', 1, 1, 1, 1, 3) + block.merge_next(another_block) + expect(block.offset_blocks_end).to eq 5 + end end describe '#add_label' do From 76a4511150430aad1fc0792f8fd3e6efd91e463a Mon Sep 17 00:00:00 2001 From: Gregory Ostermayr Date: Tue, 28 Jul 2020 21:15:05 -0400 Subject: [PATCH 08/30] spec for densities --- spec/document/text_block_spec.rb | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/spec/document/text_block_spec.rb b/spec/document/text_block_spec.rb index a0c3527..6990406 100644 --- a/spec/document/text_block_spec.rb +++ b/spec/document/text_block_spec.rb @@ -89,6 +89,16 @@ module Boilerpipe block.merge_next(another_block) expect(block.offset_blocks_end).to eq 5 end + + it 'recomputes densities' do + block = Document::TextBlock.new('one', 10, 5, 10, 2, 5) + another_block = Document::TextBlock.new('two', 10, 5, 10, 3, 3) + + block.merge_next(another_block) + + expect(block.text_density).to eq 4.0 + expect(block.link_density).to eq 0.5 + end end describe '#add_label' do From 0824787d400260aaf0de17b0725d36c5070a1064 Mon Sep 17 00:00:00 2001 From: Gregory Ostermayr Date: Tue, 28 Jul 2020 22:42:50 -0400 Subject: [PATCH 09/30] num_wrapped_lines needs to default to 1 --- lib/boilerpipe/document/text_block.rb | 3 +-- spec/document/text_block_spec.rb | 10 ++++++++++ spec/filters/simple_block_fusion_processor_spec.rb | 8 ++++---- 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/lib/boilerpipe/document/text_block.rb b/lib/boilerpipe/document/text_block.rb index 4c33110..433126d 100644 --- a/lib/boilerpipe/document/text_block.rb +++ b/lib/boilerpipe/document/text_block.rb @@ -25,7 +25,7 @@ def initialize(text, num_words = 0, num_words_in_anchor_text = 0, num_words_in_w end def self.empty_start - new('', 0, 0, 0, 0, -1) + new('', 0, 0, 0, 1, -1) end def set_tag_level(level) @@ -98,7 +98,6 @@ def clone def init_densities if @num_words_in_wrapped_lines == 0 @num_words_in_wrapped_lines = @num_words - @num_wrapped_lines = 1 end @text_density = @num_words_in_wrapped_lines / @num_wrapped_lines.to_f @link_density = @num_words == 0 ? 0.0 : @num_words_in_anchor_text / @num_words.to_f diff --git a/spec/document/text_block_spec.rb b/spec/document/text_block_spec.rb index 6990406..bd64a1e 100644 --- a/spec/document/text_block_spec.rb +++ b/spec/document/text_block_spec.rb @@ -99,6 +99,16 @@ module Boilerpipe expect(block.text_density).to eq 4.0 expect(block.link_density).to eq 0.5 end + + it 'resets wrapped lines' do + block = Document::TextBlock.new('one', 10) + another_block = Document::TextBlock.new('two', 10) + + block.merge_next(another_block) + + expect(block.num_words_in_wrapped_lines).to eq 20 + expect(block.num_wrapped_lines).to eq 2 + end end describe '#add_label' do diff --git a/spec/filters/simple_block_fusion_processor_spec.rb b/spec/filters/simple_block_fusion_processor_spec.rb index 3dea759..61da407 100644 --- a/spec/filters/simple_block_fusion_processor_spec.rb +++ b/spec/filters/simple_block_fusion_processor_spec.rb @@ -6,10 +6,10 @@ module Boilerpipe::Filters let!(:doc) { Boilerpipe::Document::TextDocument.new('', text_blocks) } context 'where blocks have same text density' do - let(:text_block1) { Boilerpipe::Document::TextBlock.new('one', 0, 0, 0, 0, 0) } - let(:text_block2) { Boilerpipe::Document::TextBlock.new('two', 0, 0, 0, 0, 1) } - let(:text_block3) { Boilerpipe::Document::TextBlock.new('three', 0, 0, 0, 0, 2) } - let(:text_block4) { Boilerpipe::Document::TextBlock.new('four', 0, 0, 0, 0, 3) } + let(:text_block1) { Boilerpipe::Document::TextBlock.new('one', 0, 0, 0, 1, 0) } + let(:text_block2) { Boilerpipe::Document::TextBlock.new('two', 0, 0, 0, 1, 1) } + let(:text_block3) { Boilerpipe::Document::TextBlock.new('three', 0, 0, 0, 1, 2) } + let(:text_block4) { Boilerpipe::Document::TextBlock.new('four', 0, 0, 0, 1, 3) } it 'the blocks are merged' do expect(doc.text_blocks.size).to eq 4 SimpleBlockFusionProcessor.process(doc) From a5f76ad4011da9dee28b872eb063403d0f5b1caf Mon Sep 17 00:00:00 2001 From: Gregory Ostermayr Date: Tue, 28 Jul 2020 23:22:40 -0400 Subject: [PATCH 10/30] merge OR's content flag --- spec/document/text_block_spec.rb | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/spec/document/text_block_spec.rb b/spec/document/text_block_spec.rb index bd64a1e..6834ff2 100644 --- a/spec/document/text_block_spec.rb +++ b/spec/document/text_block_spec.rb @@ -109,6 +109,18 @@ module Boilerpipe expect(block.num_words_in_wrapped_lines).to eq 20 expect(block.num_wrapped_lines).to eq 2 end + + it 'if one is content the merged block is content' do + block = Document::TextBlock.new('one') + block.content = false + + another_block = Document::TextBlock.new('two') + another_block.content = true + + block.merge_next(another_block) + + expect(block.content).to eq true + end end describe '#add_label' do From 6be50e42ff8a7523e4658eba9af41bb4aa90117d Mon Sep 17 00:00:00 2001 From: Gregory Ostermayr Date: Tue, 28 Jul 2020 23:27:48 -0400 Subject: [PATCH 11/30] reorder reads better --- lib/boilerpipe/document/text_block.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/boilerpipe/document/text_block.rb b/lib/boilerpipe/document/text_block.rb index 433126d..f6fa83f 100644 --- a/lib/boilerpipe/document/text_block.rb +++ b/lib/boilerpipe/document/text_block.rb @@ -66,8 +66,8 @@ def merge_next(other) @num_wrapped_lines += other.num_wrapped_lines @offset_blocks_start = [@offset_blocks_start, other.offset_blocks_start].min @offset_blocks_end = [@offset_blocks_end, other.offset_blocks_end].max - init_densities @content |= other.is_content? + init_densities if other.labels if @labels.nil? From 94305a6ff1eb1ee4177bae2b5ce37517cfc62b90 Mon Sep 17 00:00:00 2001 From: Gregory Ostermayr Date: Tue, 28 Jul 2020 23:36:03 -0400 Subject: [PATCH 12/30] merges labels --- spec/document/text_block_spec.rb | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/spec/document/text_block_spec.rb b/spec/document/text_block_spec.rb index 6834ff2..b694f22 100644 --- a/spec/document/text_block_spec.rb +++ b/spec/document/text_block_spec.rb @@ -121,6 +121,18 @@ module Boilerpipe expect(block.content).to eq true end + + it 'merges labels' do + block = Document::TextBlock.new('one') + block.add_label('boom') + + another_block = Document::TextBlock.new('two') + another_block.add_label('pow') + + block.merge_next(another_block) + + expect(block.labels).to eq Set.new(['boom', 'pow']) + end end describe '#add_label' do From cd12dd04612b42556fda9664e6d31a0fd2b3bdeb Mon Sep 17 00:00:00 2001 From: Gregory Ostermayr Date: Tue, 28 Jul 2020 23:57:06 -0400 Subject: [PATCH 13/30] no need for label check, constructor creates an empty set --- lib/boilerpipe/document/text_block.rb | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/lib/boilerpipe/document/text_block.rb b/lib/boilerpipe/document/text_block.rb index f6fa83f..cc28ccb 100644 --- a/lib/boilerpipe/document/text_block.rb +++ b/lib/boilerpipe/document/text_block.rb @@ -69,13 +69,7 @@ def merge_next(other) @content |= other.is_content? init_densities - if other.labels - if @labels.nil? - @labels = other.labels.clone - else - @labels.merge(other.labels.clone) - end - end + @labels.merge(other.labels.clone) @tag_level = [@tag_level, other.tag_level].min end From 2cf7f3e03df86390a5bab4aebc052f627aafe7b4 Mon Sep 17 00:00:00 2001 From: Gregory Ostermayr Date: Tue, 28 Jul 2020 23:57:59 -0400 Subject: [PATCH 14/30] move code for readability --- lib/boilerpipe/document/text_block.rb | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lib/boilerpipe/document/text_block.rb b/lib/boilerpipe/document/text_block.rb index cc28ccb..7a588f2 100644 --- a/lib/boilerpipe/document/text_block.rb +++ b/lib/boilerpipe/document/text_block.rb @@ -67,9 +67,8 @@ def merge_next(other) @offset_blocks_start = [@offset_blocks_start, other.offset_blocks_start].min @offset_blocks_end = [@offset_blocks_end, other.offset_blocks_end].max @content |= other.is_content? - init_densities - @labels.merge(other.labels.clone) + init_densities @tag_level = [@tag_level, other.tag_level].min end From f3806d9aa70a1442c3cd4e9cb2b9e319366124b5 Mon Sep 17 00:00:00 2001 From: Gregory Ostermayr Date: Wed, 29 Jul 2020 00:04:38 -0400 Subject: [PATCH 15/30] merges tag levels --- lib/boilerpipe/document/text_block.rb | 4 ++-- spec/document/text_block_spec.rb | 12 ++++++++++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/lib/boilerpipe/document/text_block.rb b/lib/boilerpipe/document/text_block.rb index 7a588f2..8d03381 100644 --- a/lib/boilerpipe/document/text_block.rb +++ b/lib/boilerpipe/document/text_block.rb @@ -68,9 +68,9 @@ def merge_next(other) @offset_blocks_end = [@offset_blocks_end, other.offset_blocks_end].max @content |= other.is_content? @labels.merge(other.labels.clone) - init_densities - @tag_level = [@tag_level, other.tag_level].min + + init_densities end def to_s diff --git a/spec/document/text_block_spec.rb b/spec/document/text_block_spec.rb index b694f22..fb93dc6 100644 --- a/spec/document/text_block_spec.rb +++ b/spec/document/text_block_spec.rb @@ -133,6 +133,18 @@ module Boilerpipe expect(block.labels).to eq Set.new(['boom', 'pow']) end + + it 'sets the tag level to the minimum of the two blocks' do + block = Document::TextBlock.new('one') + block.set_tag_level(2) + + another_block = Document::TextBlock.new('two') + another_block.set_tag_level(1) + + block.merge_next(another_block) + + expect(block.tag_level).to eq 1 + end end describe '#add_label' do From dec0ca2b335e14815c61c1d2617c42074a266431 Mon Sep 17 00:00:00 2001 From: Gregory Ostermayr Date: Wed, 29 Jul 2020 17:37:22 -0400 Subject: [PATCH 16/30] clean up text_block#to_s --- lib/boilerpipe/document/text_block.rb | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/lib/boilerpipe/document/text_block.rb b/lib/boilerpipe/document/text_block.rb index 8d03381..4c9e9e9 100644 --- a/lib/boilerpipe/document/text_block.rb +++ b/lib/boilerpipe/document/text_block.rb @@ -74,12 +74,15 @@ def merge_next(other) end def to_s - # "[" + offsetBlocksStart + "-" + offsetBlocksEnd + ";tl=" + tagLevel + "; nw=" + numWords + ";nwl=" + numWrappedLines + ";ld=" + linkDensity + "]\t" + (isContent ? "CONTENT" : "boilerplate") + "," + labels + "\n" + getText(); - labels = 'null' - if !@labels.empty? - labels = "[#{@labels.to_a.join(',')}]" + "[#{@offset_blocks_start}-#{@offset_blocks_end};tl=#{@tag_level}; nw=#{@num_words};nwl=#{@num_wrapped_lines};ld=#{@link_density}]\t#{is_content? ? 'CONTENT' : 'BOILERPLATE'},#{labels_to_s}\n#{text}" + end + + def labels_to_s + if @labels.empty? + 'null' + else + "[#{@labels.to_a.join(',')}]" end - "[#{@offset_blocks_start}-#{@offset_blocks_end};tl=#{@tag_level}; nw=#{@num_words};nwl=#{@num_wrapped_lines};ld=#{@link_density}]\t#{is_content? ? 'CONTENT' : 'BOILERPLATE'},#{labels}\n#{text}" end def clone From ae062248a729c3b603daae4fa90397f2fb213488 Mon Sep 17 00:00:00 2001 From: Gregory Ostermayr Date: Fri, 7 Aug 2020 17:14:51 -0400 Subject: [PATCH 17/30] remove dead code --- lib/boilerpipe.rb | 1 - lib/boilerpipe/sax/tag_actions/block_level.rb | 17 --------------- spec/sax/tag_actions/block_level_spec.rb | 21 ------------------- 3 files changed, 39 deletions(-) delete mode 100644 lib/boilerpipe/sax/tag_actions/block_level.rb delete mode 100644 spec/sax/tag_actions/block_level_spec.rb diff --git a/lib/boilerpipe.rb b/lib/boilerpipe.rb index 6a7cf39..8bce16c 100644 --- a/lib/boilerpipe.rb +++ b/lib/boilerpipe.rb @@ -50,7 +50,6 @@ require 'boilerpipe/sax/tag_actions/body' require 'boilerpipe/sax/tag_actions/inline_whitespace' require 'boilerpipe/sax/tag_actions/inline_no_whitespace' -require 'boilerpipe/sax/tag_actions/block_level' require 'boilerpipe/sax/tag_actions/font' require 'boilerpipe/sax/tag_actions/inline_tag_label' require 'boilerpipe/sax/tag_actions/block_tag_label' diff --git a/lib/boilerpipe/sax/tag_actions/block_level.rb b/lib/boilerpipe/sax/tag_actions/block_level.rb deleted file mode 100644 index 321f7ba..0000000 --- a/lib/boilerpipe/sax/tag_actions/block_level.rb +++ /dev/null @@ -1,17 +0,0 @@ -module Boilerpipe::SAX::TagActions - # Explicitly marks this tag a simple "block-level" element, - # which always generates whitespace - class BlockLevel - def start(handler, name, attrs) - true - end - - def end_tag(handler, name) - true - end - - def changes_tag_level? - true - end - end -end diff --git a/spec/sax/tag_actions/block_level_spec.rb b/spec/sax/tag_actions/block_level_spec.rb deleted file mode 100644 index 5e8471d..0000000 --- a/spec/sax/tag_actions/block_level_spec.rb +++ /dev/null @@ -1,21 +0,0 @@ -require 'spec_helper' - -module Boilerpipe::SAX::TagActions - describe BlockLevel do - describe '#start' do - it 'returns true' do - expect(subject.start(nil, nil, nil)).to be true - end - end - describe '#end_tag' do - it 'returns true' do - expect(subject.end_tag(nil, nil)).to be true - end - end - describe '#changes_tag_level?' do - it 'returns true' do - expect(subject.changes_tag_level?).to be true - end - end - end -end From 4dd160dd72c69f1e721ede6020646a1f73e2e3f0 Mon Sep 17 00:00:00 2001 From: Gregory Ostermayr Date: Fri, 7 Aug 2020 17:16:01 -0400 Subject: [PATCH 18/30] add simplecov --- boilerpipe-ruby.gemspec | 1 + spec/spec_helper.rb | 2 ++ 2 files changed, 3 insertions(+) diff --git a/boilerpipe-ruby.gemspec b/boilerpipe-ruby.gemspec index 54be038..a492123 100644 --- a/boilerpipe-ruby.gemspec +++ b/boilerpipe-ruby.gemspec @@ -23,5 +23,6 @@ Gem::Specification.new do |spec| spec.add_development_dependency 'rake', '>= 12.3.3' spec.add_development_dependency 'rickshaw', '~> 0.5.0' spec.add_development_dependency 'rspec', '~> 3.10' + spec.add_development_dependency 'simplecov', '~> 0.18.5' spec.add_runtime_dependency 'nokogiri', '~> 1.10' end diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb index 15499af..0b42506 100644 --- a/spec/spec_helper.rb +++ b/spec/spec_helper.rb @@ -1,2 +1,4 @@ +require 'simplecov' +SimpleCov.start $LOAD_PATH.unshift File.expand_path('../../lib', __FILE__) require 'boilerpipe' From dad94f60ce8778f01827ff82b05d8268492ffc04 Mon Sep 17 00:00:00 2001 From: Gregory Ostermayr Date: Thu, 20 Aug 2020 22:42:50 -0400 Subject: [PATCH 19/30] delete comments --- lib/boilerpipe/sax/html_content_handler.rb | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/lib/boilerpipe/sax/html_content_handler.rb b/lib/boilerpipe/sax/html_content_handler.rb index 612bcd8..08527a5 100644 --- a/lib/boilerpipe/sax/html_content_handler.rb +++ b/lib/boilerpipe/sax/html_content_handler.rb @@ -192,16 +192,6 @@ def is_word?(word) word =~ VALID_WORD_CHARACTER end - # public void flushBlock() { - # int numWords = 0; - # int numLinkedWords = 0; - # int numWrappedLines = 0; - # int currentLineLength = -1; // don't count the first space - # final int maxLineLength = 80; - # int numTokens = 0; - # int numWordsCurrentLine = 0; - # } - def increase_in_ignorable_element! @in_ignorable_element += 1 end From cf913e1503e8d3ed6cdfbb6d8acd6af3cdc93a5b Mon Sep 17 00:00:00 2001 From: Gregory Ostermayr Date: Thu, 20 Aug 2020 22:44:07 -0400 Subject: [PATCH 20/30] clean up whitespace event logic --- lib/boilerpipe/sax/html_content_handler.rb | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/lib/boilerpipe/sax/html_content_handler.rb b/lib/boilerpipe/sax/html_content_handler.rb index 08527a5..2e7b21f 100644 --- a/lib/boilerpipe/sax/html_content_handler.rb +++ b/lib/boilerpipe/sax/html_content_handler.rb @@ -10,7 +10,6 @@ def initialize @label_stacks = [] @tag_actions = ::Boilerpipe::SAX::TagActionMap.tag_actions @tag_level = 0 - @sb_last_was_whitespace = false @text_buffer = '' @token_buffer = '' @offset_blocks = 0 @@ -61,7 +60,6 @@ def characters(text) # add a single space if the block was only whitespace if text.empty? append_space - @last_event = :WHITESPACE return end @@ -72,7 +70,6 @@ def characters(text) append_text(text) append_space if ended_with_whitespace - @last_event = :CHARACTERS end def end_element(name) @@ -112,7 +109,7 @@ def flush_block when 0 return when 1 - clear_buffers if @sb_last_was_whitespace + clear_buffers if @last_event == :WHITESPACE return end @@ -230,16 +227,15 @@ def add_text_block(text_block) # append space if last character wasn't already one def append_space - return if @sb_last_was_whitespace - - @sb_last_was_whitespace = true + return if @last_event == :WHITESPACE + @last_event = :WHITESPACE @text_buffer << ' ' @token_buffer << ' ' end def append_text(text) - @sb_last_was_whitespace = false + @last_event = :CHARACTERS @text_buffer << text @token_buffer << text end From 5c9dd1624dc83bd305f0c8b82bf9117a313e737d Mon Sep 17 00:00:00 2001 From: Gregory Ostermayr Date: Thu, 20 Aug 2020 22:46:26 -0400 Subject: [PATCH 21/30] clean up token count logic --- lib/boilerpipe/sax/html_content_handler.rb | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/lib/boilerpipe/sax/html_content_handler.rb b/lib/boilerpipe/sax/html_content_handler.rb index 2e7b21f..a441082 100644 --- a/lib/boilerpipe/sax/html_content_handler.rb +++ b/lib/boilerpipe/sax/html_content_handler.rb @@ -113,7 +113,6 @@ def flush_block return end - num_tokens = 0 num_words = 0 num_words_current_line = 0 num_words_in_wrapped_lines = 0 @@ -129,7 +128,6 @@ def flush_block elsif ANCHOR_TEXT_END == token @in_anchor_text = false elsif is_word?(token) - num_tokens += 1 num_words += 1 num_words_current_line += 1 num_linked_words += 1 if @in_anchor_text @@ -141,12 +139,10 @@ def flush_block current_line_length = token_length num_words_current_line = 1 end - else - num_tokens += 1 end end - return if num_tokens == 0 + return if tokens.empty? num_words_in_wrapped_lines = 0 if num_wrapped_lines == 0 From ab58372e7a7628dee0505600445f11aa13024123 Mon Sep 17 00:00:00 2001 From: Gregory Ostermayr Date: Thu, 20 Aug 2020 22:47:28 -0400 Subject: [PATCH 22/30] remove extra space from tokens --- lib/boilerpipe/sax/tag_actions/anchor_text.rb | 2 -- spec/sax/tag_actions/anchor_text_spec.rb | 10 ++++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/boilerpipe/sax/tag_actions/anchor_text.rb b/lib/boilerpipe/sax/tag_actions/anchor_text.rb index a9b6dd7..3097db0 100644 --- a/lib/boilerpipe/sax/tag_actions/anchor_text.rb +++ b/lib/boilerpipe/sax/tag_actions/anchor_text.rb @@ -29,13 +29,11 @@ def changes_tag_level? def append_anchor_text_start(handler) handler.append_space handler.append_token(Boilerpipe::SAX::HTMLContentHandler::ANCHOR_TEXT_START) - handler.append_token(' ') end def append_anchor_text_end(handler) handler.append_space handler.append_token(Boilerpipe::SAX::HTMLContentHandler::ANCHOR_TEXT_END) - handler.append_token(' ') end def nested_achor_tag_error_recovering(handler, name) diff --git a/spec/sax/tag_actions/anchor_text_spec.rb b/spec/sax/tag_actions/anchor_text_spec.rb index c77f5fc..22447f8 100644 --- a/spec/sax/tag_actions/anchor_text_spec.rb +++ b/spec/sax/tag_actions/anchor_text_spec.rb @@ -10,7 +10,7 @@ module Boilerpipe::SAX::TagActions it 'adds anchor text start' do handler = Boilerpipe::SAX::HTMLContentHandler.new - expect { subject.start(handler, nil, nil) }.to change { handler.token_buffer_size }.from(0).to(5) + expect { subject.start(handler, nil, nil) }.to change { handler.token_buffer_size }.from(0).to(4) end it 'returns false' do @@ -24,15 +24,16 @@ module Boilerpipe::SAX::TagActions subject.start(handler, nil, nil) expect(handler.in_anchor_tag).to eq(1) end + it 'doesnt append end anchor text' do handler = Boilerpipe::SAX::HTMLContentHandler.new expect { subject.start(handler, nil, nil) }.to change { handler.in_anchor_tag }.from(0).to(1) # puts handler.token_buffer - expect(handler.token_buffer_size).to eq(5) + expect(handler.token_buffer_size).to eq(4) subject.start(handler, nil, nil) # puts handler.token_buffer - expect(handler.token_buffer_size).to eq(5) + expect(handler.token_buffer_size).to eq(4) end end end @@ -47,7 +48,8 @@ module Boilerpipe::SAX::TagActions it 'adds end anchor text' do handler = Boilerpipe::SAX::HTMLContentHandler.new handler.in_anchor_tag = 1 - expect { subject.end_tag(handler, nil) }.to change { handler.token_buffer_size }.from(0).to(5) + expect { subject.end_tag(handler, nil) }.to change { handler.token_buffer_size }.from(0).to(4) + puts Boilerpipe::SAX::HTMLContentHandler::ANCHOR_TEXT_END.size end context 'if in nested anchor tag' do From 716bf594770d3c6aec741137e5630310695cec67 Mon Sep 17 00:00:00 2001 From: Gregory Ostermayr Date: Fri, 21 Aug 2020 00:04:33 -0400 Subject: [PATCH 23/30] clean up / split up label logic --- lib/boilerpipe/sax/html_content_handler.rb | 26 ++++++++++------------ 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/lib/boilerpipe/sax/html_content_handler.rb b/lib/boilerpipe/sax/html_content_handler.rb index a441082..63edf6d 100644 --- a/lib/boilerpipe/sax/html_content_handler.rb +++ b/lib/boilerpipe/sax/html_content_handler.rb @@ -7,7 +7,7 @@ class HTMLContentHandler < Nokogiri::XML::SAX::Document ANCHOR_TEXT_END = ">\ue00a$" def initialize - @label_stacks = [] + @label_stacks = [[]] @tag_actions = ::Boilerpipe::SAX::TagActionMap.tag_actions @tag_level = 0 @text_buffer = '' @@ -27,7 +27,7 @@ def initialize end def start_element(name, attrs = []) - @label_stacks << nil + @label_stacks << [] tag = name.upcase.intern tag_action = @tag_actions[tag] @@ -161,6 +161,7 @@ def flush_block @offset_blocks += 1 clear_buffers text_block.set_tag_level(@block_tag_level) + classify_text_block_with_labels(text_block) add_text_block(text_block) @block_tag_level = -1 end @@ -210,14 +211,16 @@ def in_anchor_tag? @in_anchor_tag > 0 end - def add_text_block(text_block) - @label_stacks.each do |stack| - next unless stack - - stack.each do |label_action| - text_block.add_label(label_action.labels) if label_action - end + def classify_text_block_with_labels(text_block) + @label_stacks + .flatten + .filter{|stack| stack} + .each do |label_action| + text_block.add_label(label_action.labels) end + end + + def add_text_block(text_block) @text_blocks << text_block end @@ -242,11 +245,6 @@ def append_token(token) def add_label_action(label_action) label_stack = @label_stacks.last - if label_stack.nil? - label_stack = [] - @label_stacks.pop - @label_stacks << label_stack - end label_stack << label_action end From 48143187704b4c9ef732df6a325183bd29d3542e Mon Sep 17 00:00:00 2001 From: Gregory Ostermayr Date: Fri, 21 Aug 2020 00:05:04 -0400 Subject: [PATCH 24/30] update specs --- spec/sax/handler_spec.rb | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/spec/sax/handler_spec.rb b/spec/sax/handler_spec.rb index 398a851..f4baf0a 100644 --- a/spec/sax/handler_spec.rb +++ b/spec/sax/handler_spec.rb @@ -26,6 +26,14 @@ module Boilerpipe::SAX end describe '#flush_block' do + it 'resets flush' + it 'sets title with last text from TITLE tag' + it 'clears out text_buffer' + it 'clears out token_buffer' + it 'determins line and word counts' + it 'creates text block' + it 'classifies text block with labels' + it 'adds text block to document' end describe '#text_document' do @@ -68,14 +76,14 @@ module Boilerpipe::SAX end describe '#add_label_action' do - context 'with a nil as the last element in the label stacks' do + context 'with an array as the last element in the label stacks' do before { subject.start_element('boom') } - it 'removes that nil' do - expect(subject.label_stacks.first).to eq nil + it 'adds the label' do + expect(subject.label_stacks.last).to eq [] subject.add_label_action(:boom) - expect(subject.label_stacks.first).to eq [:boom] - expect(subject.label_stacks.size).to eq 1 + expect(subject.label_stacks.last).to eq [:boom] + expect(subject.label_stacks.size).to eq 2 end end end From 800dc59c914ac79c2e850120f8c627ac458edcf1 Mon Sep 17 00:00:00 2001 From: Gregory Ostermayr Date: Mon, 15 Feb 2021 14:54:11 -0500 Subject: [PATCH 25/30] remove verbose unnecessary logic --- lib/boilerpipe/filters/block_proximity_fusion.rb | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/lib/boilerpipe/filters/block_proximity_fusion.rb b/lib/boilerpipe/filters/block_proximity_fusion.rb index 9013cb0..d41a926 100644 --- a/lib/boilerpipe/filters/block_proximity_fusion.rb +++ b/lib/boilerpipe/filters/block_proximity_fusion.rb @@ -18,12 +18,7 @@ def process(doc) text_blocks = doc.text_blocks return false if text_blocks.size < 2 - prev_block = if @content_only - text_blocks.find { |tb| tb.is_content? } - else - text_blocks.first - end - + prev_block = text_blocks.first return false if prev_block.nil? offset = text_blocks.index(prev_block) + 1 From f05cf0e0dbb71da003355c01ee712f11b2ca8cf7 Mon Sep 17 00:00:00 2001 From: Gregory Ostermayr Date: Mon, 15 Feb 2021 14:58:37 -0500 Subject: [PATCH 26/30] invert boolean to reduce nesting --- .../filters/block_proximity_fusion.rb | 23 ++++++++++--------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/lib/boilerpipe/filters/block_proximity_fusion.rb b/lib/boilerpipe/filters/block_proximity_fusion.rb index d41a926..02fd39b 100644 --- a/lib/boilerpipe/filters/block_proximity_fusion.rb +++ b/lib/boilerpipe/filters/block_proximity_fusion.rb @@ -33,19 +33,20 @@ def process(doc) end diff_blocks = tb.offset_blocks_start - prev_block.offset_blocks_end - 1 - if diff_blocks <= @max_blocks_distance - ok = true - ok = false if prev_block.is_not_content? && @content_only - ok = false if ok && prev_block.tag_level != tb.tag_level && @same_tag_level_only - - if ok - prev_block.merge_next(tb) - blocks_to_remove << tb - else - prev_block = tb - end + next if diff_blocks > @max_blocks_distance + + ok = true + ok = false if prev_block.is_not_content? && @content_only + ok = false if ok && prev_block.tag_level != tb.tag_level && @same_tag_level_only + + if ok + prev_block.merge_next(tb) + blocks_to_remove << tb + else + prev_block = tb end end + doc.replace_text_blocks!(text_blocks - blocks_to_remove) doc end From 31f3a4f0c92391896b15992c55188a97bb5958b1 Mon Sep 17 00:00:00 2001 From: Gregory Ostermayr Date: Mon, 15 Feb 2021 15:00:38 -0500 Subject: [PATCH 27/30] reorder boolean checks --- lib/boilerpipe/filters/block_proximity_fusion.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/boilerpipe/filters/block_proximity_fusion.rb b/lib/boilerpipe/filters/block_proximity_fusion.rb index 02fd39b..cf38cf1 100644 --- a/lib/boilerpipe/filters/block_proximity_fusion.rb +++ b/lib/boilerpipe/filters/block_proximity_fusion.rb @@ -36,8 +36,8 @@ def process(doc) next if diff_blocks > @max_blocks_distance ok = true - ok = false if prev_block.is_not_content? && @content_only - ok = false if ok && prev_block.tag_level != tb.tag_level && @same_tag_level_only + ok = false if @content_only && prev_block.is_not_content? + ok = false if ok && @same_tag_level_only && prev_block.tag_level != tb.tag_level if ok prev_block.merge_next(tb) From 647a6f1923da737942443a28d7b679efb14c3e06 Mon Sep 17 00:00:00 2001 From: Gregory Ostermayr Date: Fri, 21 Aug 2020 00:20:57 -0400 Subject: [PATCH 28/30] formatting formatting --- lib/boilerpipe/sax/html_content_handler.rb | 3 ++- spec/sax/tag_actions/block_tag_label_spec.rb | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/lib/boilerpipe/sax/html_content_handler.rb b/lib/boilerpipe/sax/html_content_handler.rb index 63edf6d..d8b4f18 100644 --- a/lib/boilerpipe/sax/html_content_handler.rb +++ b/lib/boilerpipe/sax/html_content_handler.rb @@ -156,7 +156,8 @@ def flush_block num_words, num_linked_words, num_words_in_wrapped_lines, - num_wrapped_lines, @offset_blocks) + num_wrapped_lines, + @offset_blocks) @offset_blocks += 1 clear_buffers diff --git a/spec/sax/tag_actions/block_tag_label_spec.rb b/spec/sax/tag_actions/block_tag_label_spec.rb index f30ec0d..b24e73d 100644 --- a/spec/sax/tag_actions/block_tag_label_spec.rb +++ b/spec/sax/tag_actions/block_tag_label_spec.rb @@ -8,16 +8,19 @@ module Boilerpipe::SAX::TagActions describe '.new' do it 'takes a label action' end + describe '#start' do it 'returns true' do expect(subject.start(handler, nil, nil)).to be true end end + describe '#end_tag' do it 'returns true' do expect(subject.end_tag(handler, nil)).to be true end end + describe '#changes_tag_level?' do it 'returns true' do expect(subject.changes_tag_level?).to be true From 6545233642716cbd2d905df6282f5bff93c1ec74 Mon Sep 17 00:00:00 2001 From: Gregory Ostermayr Date: Sat, 13 Feb 2021 15:09:58 -0500 Subject: [PATCH 29/30] wip --- Gemfile.lock | 47 +++++++++++++++++++++ lib/boilerpipe/sax/html_content_handler.rb | 8 +++- spec/filters/block_proximity_fusion_spec.rb | 14 ++++-- 3 files changed, 64 insertions(+), 5 deletions(-) create mode 100644 Gemfile.lock diff --git a/Gemfile.lock b/Gemfile.lock new file mode 100644 index 0000000..0ff1925 --- /dev/null +++ b/Gemfile.lock @@ -0,0 +1,47 @@ +PATH + remote: . + specs: + boilerpipe-ruby (0.4.3) + nokogiri (~> 1.10) + +GEM + remote: https://rubygems.org/ + specs: + diff-lcs (1.4.4) + docile (1.3.2) + mini_portile2 (2.4.0) + nokogiri (1.10.10) + mini_portile2 (~> 2.4.0) + rake (13.0.1) + rickshaw (0.5.0) + rspec (3.9.0) + rspec-core (~> 3.9.0) + rspec-expectations (~> 3.9.0) + rspec-mocks (~> 3.9.0) + rspec-core (3.9.2) + rspec-support (~> 3.9.3) + rspec-expectations (3.9.2) + diff-lcs (>= 1.2.0, < 2.0) + rspec-support (~> 3.9.0) + rspec-mocks (3.9.1) + diff-lcs (>= 1.2.0, < 2.0) + rspec-support (~> 3.9.0) + rspec-support (3.9.3) + simplecov (0.18.5) + docile (~> 1.1) + simplecov-html (~> 0.11) + simplecov-html (0.12.2) + +PLATFORMS + ruby + +DEPENDENCIES + boilerpipe-ruby! + bundler (~> 2.0) + rake (>= 12.3.3) + rickshaw (~> 0.5.0) + rspec (~> 3.9) + simplecov (~> 0.18.5) + +BUNDLED WITH + 2.1.4 diff --git a/lib/boilerpipe/sax/html_content_handler.rb b/lib/boilerpipe/sax/html_content_handler.rb index d8b4f18..bd9b34f 100644 --- a/lib/boilerpipe/sax/html_content_handler.rb +++ b/lib/boilerpipe/sax/html_content_handler.rb @@ -31,6 +31,7 @@ def start_element(name, attrs = []) tag = name.upcase.intern tag_action = @tag_actions[tag] + org = @tag_level if tag_action @tag_level += 1 if tag_action.changes_tag_level? @flush = tag_action.start(self, name, attrs) | @flush @@ -38,6 +39,7 @@ def start_element(name, attrs = []) @tag_level += 1 @flush = true end + puts "before: #{org}, after: #{@tag_level}" @last_event = :START_TAG @last_start_tag = tag @@ -64,7 +66,11 @@ def characters(text) end # set block levels - @block_tag_level = @tag_level if @block_tag_level == -1 + if @block_tag_level == -1 + puts "-1 setting block level tag_level: #{@tag_level}" + @block_tag_level = @tag_level + end + puts "block_tag_level: #{@block_tag_level}" append_space if started_with_whitespace append_text(text) diff --git a/spec/filters/block_proximity_fusion_spec.rb b/spec/filters/block_proximity_fusion_spec.rb index e09788e..d1cd849 100644 --- a/spec/filters/block_proximity_fusion_spec.rb +++ b/spec/filters/block_proximity_fusion_spec.rb @@ -2,10 +2,10 @@ module Boilerpipe::Filters describe BlockProximityFusion do - let(:text_block1) { Boilerpipe::Document::TextBlock.new('one', 0, 0, 0, 0, 0) } - let(:text_block2) { Boilerpipe::Document::TextBlock.new('two', 0, 0, 0, 0, 1) } - let(:text_block3) { Boilerpipe::Document::TextBlock.new('three', 0, 0, 0, 0, 2) } - let(:text_block4) { Boilerpipe::Document::TextBlock.new('four', 0, 0, 0, 0, 3) } + let(:text_block1) { Boilerpipe::Document::TextBlock.new('one', 0, 0, 0, 1, 0) } + let(:text_block2) { Boilerpipe::Document::TextBlock.new('two', 0, 0, 0, 1, 1) } + let(:text_block3) { Boilerpipe::Document::TextBlock.new('three', 0, 0, 0, 1, 2) } + let(:text_block4) { Boilerpipe::Document::TextBlock.new('four', 0, 0, 0, 1, 3) } let(:text_blocks) { [text_block1, text_block2, text_block3, text_block4] } let!(:doc) { Boilerpipe::Document::TextDocument.new('', text_blocks) } @@ -18,6 +18,7 @@ module Boilerpipe::Filters describe '#process' do context 'where blocks exceed distance' do + # only_content: true, same_tag_level: false it 'doesnt change blocks' do expect(doc.text_blocks.size).to eq 4 filter = BlockProximityFusion.new(1, true, false) @@ -27,10 +28,15 @@ module Boilerpipe::Filters end context 'where blocks do not exceed distance' do + # only_content: false, same_tag_level: false it 'Fuses adjacent blocks' do + puts doc.text_blocks.map(&:text).inspect + puts doc.debug_s expect(doc.text_blocks.last.text.size).to eq 4 filter = BlockProximityFusion.new(1, false, false) filter.process(doc) + puts doc.text_blocks.map(&:text).inspect + puts doc.debug_s expect(doc.text_blocks.last.text).to eq "three\nfour" end From 1f0a33548fe6ed3981aaab58ee01a0093b101fc5 Mon Sep 17 00:00:00 2001 From: Gregory Ostermayr Date: Sun, 14 Feb 2021 19:33:46 -0500 Subject: [PATCH 30/30] remove gemfile lock --- Gemfile.lock | 47 ----------------------------------------------- 1 file changed, 47 deletions(-) delete mode 100644 Gemfile.lock diff --git a/Gemfile.lock b/Gemfile.lock deleted file mode 100644 index 0ff1925..0000000 --- a/Gemfile.lock +++ /dev/null @@ -1,47 +0,0 @@ -PATH - remote: . - specs: - boilerpipe-ruby (0.4.3) - nokogiri (~> 1.10) - -GEM - remote: https://rubygems.org/ - specs: - diff-lcs (1.4.4) - docile (1.3.2) - mini_portile2 (2.4.0) - nokogiri (1.10.10) - mini_portile2 (~> 2.4.0) - rake (13.0.1) - rickshaw (0.5.0) - rspec (3.9.0) - rspec-core (~> 3.9.0) - rspec-expectations (~> 3.9.0) - rspec-mocks (~> 3.9.0) - rspec-core (3.9.2) - rspec-support (~> 3.9.3) - rspec-expectations (3.9.2) - diff-lcs (>= 1.2.0, < 2.0) - rspec-support (~> 3.9.0) - rspec-mocks (3.9.1) - diff-lcs (>= 1.2.0, < 2.0) - rspec-support (~> 3.9.0) - rspec-support (3.9.3) - simplecov (0.18.5) - docile (~> 1.1) - simplecov-html (~> 0.11) - simplecov-html (0.12.2) - -PLATFORMS - ruby - -DEPENDENCIES - boilerpipe-ruby! - bundler (~> 2.0) - rake (>= 12.3.3) - rickshaw (~> 0.5.0) - rspec (~> 3.9) - simplecov (~> 0.18.5) - -BUNDLED WITH - 2.1.4