class Boilerpipe::Document::TextBlock

Attributes

content[RW]
labels[R]

EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)

num_full_text_words[R]

EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)

num_words[R]

EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)

num_words_in_anchor_text[R]

EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)

num_words_in_wrapped_lines[R]

EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)

num_wrapped_lines[R]

EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)

offset_blocks_end[R]

EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)

offset_blocks_start[R]

EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)

tag_level[R]

EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)

text[R]

EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)

text_density[R]

EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)

Public Class Methods

empty_start() click to toggle source
# File lib/boilerpipe/document/text_block.rb, line 28
# Builds a block with empty text, all word/line counts set to zero and a
# block offset of -1.
def self.empty_start
  start_offset = -1
  new('', 0, 0, 0, 0, start_offset)
end
new(text, num_words = 0, num_words_in_anchor_text = 0, num_words_in_wrapped_lines = 0, num_wrapped_lines = 1, offset_blocks = 0) click to toggle source
# File lib/boilerpipe/document/text_block.rb, line 12
# Sets up a text block from its text and pre-computed statistics.
#
# text                       - stored as @text
# num_words                  - stored as @num_words
# num_words_in_anchor_text   - stored as @num_words_in_anchor_text
# num_words_in_wrapped_lines - stored as @num_words_in_wrapped_lines
# num_wrapped_lines          - stored as @num_wrapped_lines
# offset_blocks              - used for both @offset_blocks_start and @offset_blocks_end
#
# A new block starts with an empty label set, is not marked as content, has
# tag level 0 and zero full-text words. The densities are computed last
# because init_densities reads the counters assigned here.
def initialize(text, num_words = 0, num_words_in_anchor_text = 0, num_words_in_wrapped_lines = 0, num_wrapped_lines = 1, offset_blocks = 0)
  # Caller-supplied text and statistics.
  @text = text
  @num_words = num_words
  @num_words_in_anchor_text = num_words_in_anchor_text
  @num_words_in_wrapped_lines = num_words_in_wrapped_lines
  @num_wrapped_lines = num_wrapped_lines

  # Start and end share the same offset until blocks are merged.
  @offset_blocks_start = @offset_blocks_end = offset_blocks

  # Fixed starting state.
  @num_full_text_words = 0
  @tag_level = 0
  @content = false
  @labels = Set.new

  init_densities
end

Public Instance Methods

add_label(label) click to toggle source
# File lib/boilerpipe/document/text_block.rb, line 44
# Adds +label+ to this block's label collection and returns that collection.
def add_label(label)
  @labels.add(label)
end
add_labels(labels) click to toggle source
# File lib/boilerpipe/document/text_block.rb, line 48
# Adds every element of +labels+ by handing each one to add_label.
# Returns whatever +labels.each+ returns.
def add_labels(labels)
  labels.each { |entry| add_label(entry) }
end
clone() click to toggle source
# File lib/boilerpipe/document/text_block.rb, line 95
# Copying a text block is not supported; this always raises.
#
# Uses +raise+ rather than +throw+: +throw+ belongs to catch/throw control
# flow and, with no matching +catch+, surfaces as UncaughtThrowError instead
# of the intended NotImplementedError.
#
# Note that NotImplementedError descends from ScriptError, so a bare
# +rescue+ (StandardError) will not catch it.
#
# Raises NotImplementedError.
def clone
  raise NotImplementedError, "#{self.class}#clone is not implemented"
end
has_label?(label) click to toggle source
# File lib/boilerpipe/document/text_block.rb, line 54
# Tells whether +label+ is currently among this block's labels.
def has_label?(label)
  @labels.member?(label)
end
is_content?() click to toggle source
# File lib/boilerpipe/document/text_block.rb, line 36
# Returns the current value of the @content flag.
# The constructor sets it to false; merge_next ORs in the other block's flag.
def is_content?
  @content
end
is_not_content?() click to toggle source
# File lib/boilerpipe/document/text_block.rb, line 40
# Logical negation of is_content?: true when the block is not flagged as content.
def is_not_content?
  !is_content?
end
merge_next(other) click to toggle source
# File lib/boilerpipe/document/text_block.rb, line 62
# Folds +other+ into this block, mutating the receiver.
#
# - texts are joined with a newline
# - word and line counters are summed
# - the offset range is widened to cover both blocks
# - densities are recomputed from the summed counters
# - the content flag is OR-ed with the other block's flag
# - labels are merged (a copy of the other block's labels is used)
# - the tag level becomes the smaller of the two
#
# other - an object exposing the same readers as this class plus is_content?
#
# Returns the resulting tag level (the value of the last assignment).
def merge_next(other)
  @text = "#{@text}\n#{other.text}"

  # Sum the statistics that init_densities reads.
  @num_words = @num_words + other.num_words
  @num_words_in_anchor_text = @num_words_in_anchor_text + other.num_words_in_anchor_text
  @num_words_in_wrapped_lines = @num_words_in_wrapped_lines + other.num_words_in_wrapped_lines
  @num_wrapped_lines = @num_wrapped_lines + other.num_wrapped_lines

  # Widen the covered block range.
  start_candidates = [@offset_blocks_start, other.offset_blocks_start]
  @offset_blocks_start = start_candidates.min
  end_candidates = [@offset_blocks_end, other.offset_blocks_end]
  @offset_blocks_end = end_candidates.max

  # Densities depend on the counters updated above.
  init_densities

  @content = @content | other.is_content?
  @num_full_text_words = @num_full_text_words + other.num_full_text_words

  incoming_labels = other.labels
  if incoming_labels
    # Work on a copy so the other block's label collection is never shared.
    label_copy = incoming_labels.clone
    if @labels.nil?
      @labels = label_copy
    else
      @labels.merge(label_copy)
    end
  end

  @tag_level = [@tag_level, other.tag_level].min
end
remove_label(label) click to toggle source
# File lib/boilerpipe/document/text_block.rb, line 58
# Removes +label+ from this block's label collection.
# Returns the result of @labels.delete (the constructor makes @labels a Set).
def remove_label(label)
  @labels.delete(label)
end
set_tag_level(level) click to toggle source
# File lib/boilerpipe/document/text_block.rb, line 32
# Overwrites the stored tag level with +level+ and returns the assigned value.
def set_tag_level(level)
  @tag_level = level
end
to_s() click to toggle source
# File lib/boilerpipe/document/text_block.rb, line 86
# Renders a two-line debug description of the block:
#
#   [start-end;tl=<tag level>; nw=<words>;nwl=<wrapped lines>;ld=<link density>]<TAB>CONTENT|BOILERPLATE,<labels>
#   <text>
#
# Labels are shown as "[a,b]", or as the literal "null" when there are none.
# The layout follows the Java toString() this class was ported from.
def to_s
  label_text = @labels.empty? ? 'null' : "[#{@labels.to_a.join(',')}]"
  classification = is_content? ? 'CONTENT' : 'BOILERPLATE'
  stats = "[#{@offset_blocks_start}-#{@offset_blocks_end};tl=#{@tag_level}; nw=#{@num_words};nwl=#{@num_wrapped_lines};ld=#{@link_density}]"

  "#{stats}\t#{classification},#{label_text}\n#{text}"
end

Private Instance Methods

init_densities() click to toggle source
# File lib/boilerpipe/document/text_block.rb, line 101
# Recomputes @text_density and @link_density from the current counters.
#
# When no words were counted in wrapped lines, all words are treated as one
# wrapped line: the wrapped-word count becomes @num_words and the line count
# becomes 1.
#
# text density = words in wrapped lines / wrapped lines
# link density = words in anchor text / words (0.0 for a block with no words)
#
# Returns the link density.
def init_densities
  no_wrapped_words = @num_words_in_wrapped_lines == 0
  if no_wrapped_words
    @num_words_in_wrapped_lines = @num_words
    @num_wrapped_lines = 1
  end

  wrapped_line_count = @num_wrapped_lines.to_f
  @text_density = @num_words_in_wrapped_lines / wrapped_line_count

  @link_density =
    if @num_words == 0
      0.0
    else
      @num_words_in_anchor_text / @num_words.to_f
    end
end