class Boilerpipe::Document::TextBlock
Attributes
EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)
EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)
EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)
EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)
EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)
EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)
EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)
EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)
EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)
EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)
EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)
EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)
Public Class Methods
# File lib/boilerpipe/document/text_block.rb, line 28
#
# Builds a block with empty text, zeros for the four count arguments and
# -1 as the trailing (offset) argument.
# NOTE(review): presumably a sentinel placed before the first real block;
# confirm against callers.
#
# @return [Object] whatever +new+ returns for these arguments
def self.empty_start
  new('', 0, 0, 0, 0, -1)
end
# File lib/boilerpipe/document/text_block.rb, line 12
#
# Sets up a text block from its text and word/line counters, then computes
# the derived densities via +init_densities+.
#
# @param text the block's text
# @param num_words word count stored in @num_words
# @param num_words_in_anchor_text stored in @num_words_in_anchor_text
# @param num_words_in_wrapped_lines stored in @num_words_in_wrapped_lines
# @param num_wrapped_lines stored in @num_wrapped_lines
# @param offset_blocks used for both the start and the end block offset
def initialize(text,
               num_words = 0,
               num_words_in_anchor_text = 0,
               num_words_in_wrapped_lines = 0,
               num_wrapped_lines = 1,
               offset_blocks = 0)
  # State that does not depend on the arguments.
  @labels = Set.new
  @content = false
  @tag_level = 0
  @num_full_text_words = 0

  # Values taken straight from the arguments.
  @text = text
  @num_words = num_words
  @num_words_in_anchor_text = num_words_in_anchor_text
  @num_words_in_wrapped_lines = num_words_in_wrapped_lines
  @num_wrapped_lines = num_wrapped_lines

  # A fresh block covers exactly one offset, so start and end coincide.
  @offset_blocks_start = @offset_blocks_end = offset_blocks

  init_densities
end
Public Instance Methods
# File lib/boilerpipe/document/text_block.rb, line 44
#
# Adds +label+ to this block's label set.
#
# @param label the label to add
# @return the label set itself
def add_label(label)
  @labels.add(label)
end
# File lib/boilerpipe/document/text_block.rb, line 48
#
# Adds every element of +labels+ through +add_label+.
#
# @param labels a collection responding to +each+
# @return whatever +labels.each+ returns (the collection itself for core enumerables)
def add_labels(labels)
  labels.each { |label| add_label(label) }
end
# File lib/boilerpipe/document/text_block.rb, line 95
#
# Cloning a text block is not supported.
#
# Uses +raise+ rather than +throw+: +throw+ belongs to catch/throw control
# flow and, with no matching +catch+, surfaces as UncaughtThrowError
# instead of the intended NotImplementedError.
#
# @raise [NotImplementedError] always. NotImplementedError descends from
#   ScriptError, so a bare +rescue+ will not catch it.
def clone
  raise NotImplementedError
end
# File lib/boilerpipe/document/text_block.rb, line 54
#
# Tells whether +label+ is currently in this block's label set.
#
# @param label the label to look up
# @return [Boolean]
def has_label?(label)
  @labels.member?(label)
end
# File lib/boilerpipe/document/text_block.rb, line 36
#
# Reader for the content flag.
#
# @return the current value of @content
def is_content?
  @content
end
# File lib/boilerpipe/document/text_block.rb, line 40
#
# Logical negation of +is_content?+.
#
# @return [Boolean] true when +is_content?+ is falsy, false otherwise
def is_not_content?
  !is_content?
end
# File lib/boilerpipe/document/text_block.rb, line 62
#
# Folds +other+ into this block in place: texts are joined with a newline,
# counters are summed, the offset range is widened, densities are
# recomputed, the content flag is OR-ed, labels are unioned and the lower
# tag level wins.
#
# @param other an object exposing the same readers as this block (text,
#   num_words, num_words_in_anchor_text, num_words_in_wrapped_lines,
#   num_wrapped_lines, offset_blocks_start, offset_blocks_end, is_content?,
#   num_full_text_words, labels, tag_level)
# @return the resulting tag level (value of the final assignment)
def merge_next(other)
  @text = "#{@text}\n#{other.text}"

  # Counters are additive.
  @num_words += other.num_words
  @num_words_in_anchor_text += other.num_words_in_anchor_text
  @num_words_in_wrapped_lines += other.num_words_in_wrapped_lines
  @num_wrapped_lines += other.num_wrapped_lines

  # The merged block spans from the smallest start to the largest end offset.
  @offset_blocks_start = [@offset_blocks_start, other.offset_blocks_start].min
  @offset_blocks_end = [@offset_blocks_end, other.offset_blocks_end].max

  # Densities depend on the counters updated above.
  init_densities

  @content = @content | other.is_content?
  @num_full_text_words += other.num_full_text_words

  # Work on a copy so the two blocks never share one label collection.
  incoming = other.labels
  if incoming
    copied = incoming.clone
    if @labels.nil?
      @labels = copied
    else
      @labels.merge(copied)
    end
  end

  @tag_level = [@tag_level, other.tag_level].min
end
# File lib/boilerpipe/document/text_block.rb, line 58
#
# Removes +label+ from this block's label set.
#
# @param label the label to remove
# @return whatever +@labels.delete+ returns
def remove_label(label)
  @labels.delete(label)
end
# File lib/boilerpipe/document/text_block.rb, line 32
#
# Overwrites the stored tag level.
#
# @param level the new value for @tag_level
# @return the assigned level
def set_tag_level(level)
  @tag_level = level
end
# File lib/boilerpipe/document/text_block.rb, line 86 def to_s # "[" + offsetBlocksStart + "-" + offsetBlocksEnd + ";tl=" + tagLevel + "; nw=" + numWords + ";nwl=" + numWrappedLines + ";ld=" + linkDensity + "]\t" + (isContent ? "CONTENT" : "boilerplate") + "," + labels + "\n" + getText(); labels = 'null' if !@labels.empty? labels = "[#{@labels.to_a.join(',')}]" end "[#{@offset_blocks_start}-#{@offset_blocks_end};tl=#{@tag_level}; nw=#{@num_words};nwl=#{@num_wrapped_lines};ld=#{@link_density}]\t#{is_content? ? 'CONTENT' : 'BOILERPLATE'},#{labels}\n#{text}" end
Private Instance Methods
# File lib/boilerpipe/document/text_block.rb, line 101
#
# Recomputes @text_density and @link_density from the current counters.
#
# When no words were counted in wrapped lines, the block is treated as a
# single line holding all of its words before the text density is taken.
#
# @return [Float] the link density just computed
def init_densities
  if @num_words_in_wrapped_lines == 0
    @num_wrapped_lines = 1
    @num_words_in_wrapped_lines = @num_words
  end

  @text_density = @num_words_in_wrapped_lines / @num_wrapped_lines.to_f

  # Guard the division: a block without words has no links to weigh.
  @link_density =
    if @num_words == 0
      0.0
    else
      @num_words_in_anchor_text / @num_words.to_f
    end
end