class Boilerpipe::SAX::HTMLContentHandler

Constants

ANCHOR_TEXT_END
ANCHOR_TEXT_START
VALID_WORD_CHARACTER

Attributes

font_size_stack[RW]
in_anchor_tag[RW]
in_ignorable_element[R]
label_stacks[R]
last_start_tag[R]
token_buffer[RW]

Public Class Methods

new() click to toggle source
# File lib/boilerpipe/sax/html_content_handler.rb, line 9
# Sets up an empty handler: no text buffered, no blocks collected, and all
# nesting counters at zero.
def initialize
  # One entry per currently open element (see start_element/end_element).
  @label_stacks = []
  @tag_actions = ::Boilerpipe::SAX::TagActionMap.tag_actions
  @tag_level = 0
  @sb_last_was_whitespace = false
  # Both buffers are appended to in place with `<<`, so they must be
  # mutable even when string literals are frozen; unary plus guarantees that.
  @text_buffer = +''
  @token_buffer = +''
  @offset_blocks = 0
  @flush = false
  # -1 means "no block level recorded yet" (see characters/flush_block).
  @block_tag_level = -1

  @in_body_tag = 0
  @in_anchor_tag = 0
  @in_ignorable_element = 0
  @in_anchor_text = false
  @font_size_stack = []
  @last_start_tag = ''
  # Assigned by start_element/end_element/characters; declared here so that
  # every instance variable the handler uses exists from construction.
  @last_end_tag = nil
  @last_event = nil
  @title = nil
  @text_blocks = []
end

Public Instance Methods

add_label_action(label_action) click to toggle source
# File lib/boilerpipe/sax/html_content_handler.rb, line 261
# Adds +label_action+ to the label stack on top of @label_stacks.
#
# When the top entry is nil (or there is no entry at all) it is replaced by
# a fresh array first. Returns the stack the action was added to.
def add_label_action(label_action)
  if @label_stacks.last.nil?
    # Swap the nil placeholder for a real stack.
    @label_stacks.pop
    @label_stacks.push([])
  end

  @label_stacks.last << label_action
end
add_text_block(text_block) click to toggle source
# File lib/boilerpipe/sax/html_content_handler.rb, line 230
# Applies the labels of every pending label action to +text_block+ and then
# stores the block in @text_blocks.
#
# Empty (nil) stacks and nil actions are skipped. Returns @text_blocks.
def add_text_block(text_block)
  # Flatten one level: every action from every non-empty stack, in order.
  pending_actions = @label_stacks.flat_map { |stack| stack || [] }

  pending_actions.each do |action|
    text_block.add_label(action.labels) if action
  end

  @text_blocks.push(text_block)
end
append_space() click to toggle source

Appends a single space to both buffers unless the last thing appended was already whitespace.

# File lib/boilerpipe/sax/html_content_handler.rb, line 242
# Adds one space to both the text buffer and the token buffer, unless the
# previous append was already whitespace (so runs of whitespace collapse).
#
# Returns the token buffer when a space was added, nil otherwise.
def append_space
  unless @sb_last_was_whitespace
    @sb_last_was_whitespace = true

    @text_buffer << ' '
    @token_buffer << ' '
  end
end
append_text(text) click to toggle source
# File lib/boilerpipe/sax/html_content_handler.rb, line 251
# Appends +text+ to both the text buffer and the token buffer and records
# that the last thing written was not whitespace.
#
# Returns the token buffer.
def append_text(text)
  @sb_last_was_whitespace = false

  @text_buffer.concat(text)
  @token_buffer.concat(text)
end
append_token(token) click to toggle source
# File lib/boilerpipe/sax/html_content_handler.rb, line 257
# Appends +token+ to the token buffer only; the text buffer is left alone.
#
# Returns the token buffer.
def append_token(token)
  @token_buffer.concat(token)
end
characters(text) click to toggle source
# File lib/boilerpipe/sax/html_content_handler.rb, line 47
# Handles a run of character data.
#
# Flushes a pending block first, then (unless inside an ignorable element)
# collapses whitespace and appends the text to the buffers, keeping at most
# one space on either side.
#
# The caller's string is never modified, so frozen strings are accepted.
#
# @param text [String] raw character data
# @return [Symbol, nil] :CHARACTERS when text was appended, nil otherwise
def characters(text)
  flush_block if @flush

  return if in_ignorable_element?
  return if text.empty?

  # Collapse every whitespace run into a single space, on a copy.
  collapsed = text.gsub(/\s+/, ' ')

  # After collapsing, any leading/trailing whitespace is exactly one space.
  started_with_whitespace = collapsed.start_with?(' ')
  ended_with_whitespace = collapsed.end_with?(' ')
  trimmed = collapsed.strip

  # Whitespace-only input contributes a single space at most.
  if trimmed.empty?
    append_space
    @last_event = :WHITESPACE
    return
  end

  # Remember the tag level at which the current block started.
  @block_tag_level = @tag_level if @block_tag_level == -1

  append_space if started_with_whitespace
  append_text(trimmed)
  append_space if ended_with_whitespace

  @last_event = :CHARACTERS
end
decrease_in_ignorable_element!() click to toggle source

should we prevent less than zero here?

# File lib/boilerpipe/sax/html_content_handler.rb, line 210
# Leaves one level of ignorable-element nesting.
#
# The counter is a nesting depth, so it is clamped at zero: an unmatched
# end tag must not push it negative, or a later start tag would only bring
# it back to zero and the element would not be treated as ignorable.
#
# @return [Integer] the new depth
def decrease_in_ignorable_element!
  @in_ignorable_element = [@in_ignorable_element - 1, 0].max
end
end_element(name) click to toggle source
# File lib/boilerpipe/sax/html_content_handler.rb, line 78
# Handles a closing tag.
#
# A tag with a registered action lets that action decide whether a flush is
# needed (combined with any flush already pending); an unknown tag always
# forces a flush. The tag level drops for unknown tags and for actions that
# report changing the tag level. Finally the element's entry is removed from
# @label_stacks and returned.
def end_element(name)
  tag = name.upcase.to_sym
  action = @tag_actions[tag]

  # Non-short-circuit `|` on purpose: the action's end_tag must always run.
  @flush = action ? (action.end_tag(self, name) | @flush) : true

  leaves_level = action.nil? || action.changes_tag_level?
  @tag_level -= 1 if leaves_level

  flush_block if @flush

  @last_event = :END_TAG
  @last_end_tag = tag
  @label_stacks.pop
end
enter_body_tag!() click to toggle source
# File lib/boilerpipe/sax/html_content_handler.rb, line 214
# Records entering a body tag by raising the @in_body_tag counter by one.
#
# Returns the new counter value.
def enter_body_tag!
  @in_body_tag = @in_body_tag + 1
end
exit_body_tag!() click to toggle source
# File lib/boilerpipe/sax/html_content_handler.rb, line 218
# Records leaving a body tag by lowering the @in_body_tag counter by one.
#
# Returns the new counter value.
def exit_body_tag!
  @in_body_tag = @in_body_tag - 1
end
flush_block() click to toggle source
# File lib/boilerpipe/sax/html_content_handler.rb, line 99
# Turns the buffered text into a TextBlock and stores it via add_text_block.
#
# Outside the body nothing is emitted: the buffers are cleared, and the
# token buffer becomes the document title when the last start tag was TITLE.
# Inside the body the token buffer is tokenized and the word, linked-word
# and wrapped-line counts are computed (lines wrap at 80 characters) before
# the block is built.
#
# @return [Integer, nil] -1 after a block was emitted, nil on an early exit
def flush_block
  @flush = false

  # Text seen outside <body> is only ever used as the title.
  if not_in_body_tag?
    @title = @token_buffer.strip if :TITLE == @last_start_tag
    clear_buffers
    return
  end

  # Nothing buffered, or a single character: no block is produced. The
  # buffers are cleared only when the last append was whitespace.
  case @token_buffer.size
  when 0
    return
  when 1
    clear_buffers if @sb_last_was_whitespace
    return
  end

  num_tokens = 0
  num_words = 0
  num_words_current_line = 0
  num_wrapped_lines = 0
  num_linked_words = 0
  # Each word adds its length plus one separating space. The first word on
  # a line has no space before it, so start at -1 to cancel that out; this
  # matches the value a wrapped line restarts with (the bare token length).
  current_line_length = -1
  max_line_length = 80

  tokens = ::Boilerpipe::UnicodeTokenizer.tokenize(@token_buffer)
  tokens.each do |token|
    if ANCHOR_TEXT_START == token
      @in_anchor_text = true
    elsif ANCHOR_TEXT_END == token
      @in_anchor_text = false
    elsif is_word?(token)
      num_tokens += 1
      num_words += 1
      num_words_current_line += 1
      num_linked_words += 1 if @in_anchor_text
      token_length = token.size
      current_line_length += token_length + 1

      # The word does not fit: it becomes the first word of a new line.
      if current_line_length > max_line_length
        num_wrapped_lines += 1
        current_line_length = token_length
        num_words_current_line = 1
      end
    else
      # Non-word tokens count as tokens but not as words.
      num_tokens += 1
    end
  end

  return if num_tokens.zero?

  if num_wrapped_lines.zero?
    # Everything fit on one line: treat it as a single wrapped line.
    num_words_in_wrapped_lines = num_words
    num_wrapped_lines = 1
  else
    # Words on the last, unfinished line are not part of a wrapped line.
    num_words_in_wrapped_lines = num_words - num_words_current_line
  end

  text_block = ::Boilerpipe::Document::TextBlock.new(@text_buffer.strip,
                                                     num_words,
                                                     num_linked_words,
                                                     num_words_in_wrapped_lines,
                                                     num_wrapped_lines, @offset_blocks)

  @offset_blocks += 1
  clear_buffers
  text_block.set_tag_level(@block_tag_level)
  add_text_block(text_block)
  # Reset so the next characters call records a fresh block level.
  @block_tag_level = -1
end
in_anchor_tag?() click to toggle source
# File lib/boilerpipe/sax/html_content_handler.rb, line 226
# True while the @in_anchor_tag counter is above zero.
def in_anchor_tag?
  @in_anchor_tag.positive?
end
in_ignorable_element?() click to toggle source
# File lib/boilerpipe/sax/html_content_handler.rb, line 222
# True while the @in_ignorable_element counter is above zero.
def in_ignorable_element?
  @in_ignorable_element.positive?
end
increase_in_ignorable_element!() click to toggle source

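Note: the excerpt below is Java, not Ruby. It appears to be the list of counters declared in the Java flushBlock() that flush_block mirrors, and it does not describe this method.

public void flushBlock() {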

int numWords = 0;
int numLinkedWords = 0;
int numWrappedLines = 0;
int currentLineLength = -1; // don't count the first space
final int maxLineLength = 80;
int numTokens = 0;
int numWordsCurrentLine = 0;

}

# File lib/boilerpipe/sax/html_content_handler.rb, line 205
# Enters one more level of ignorable-element nesting.
#
# Returns the new counter value.
def increase_in_ignorable_element!
  @in_ignorable_element = @in_ignorable_element + 1
end
is_word?(word) click to toggle source

Matches against a Unicode regex built from these categories: p{L} – a letter; p{Nd} – a decimal digit; p{Nl} – a letter-like numeric character; p{No} – a numeric character of another type.

# File lib/boilerpipe/sax/html_content_handler.rb, line 191
# Tests +word+ against VALID_WORD_CHARACTER.
#
# Returns the index of the first match (truthy, even when 0) or nil when
# nothing matches.
def is_word?(word)
  VALID_WORD_CHARACTER =~ word
end
not_in_body_tag?() click to toggle source
# File lib/boilerpipe/sax/html_content_handler.rb, line 95
# True when the @in_body_tag counter is exactly zero.
def not_in_body_tag?
  0 == @in_body_tag
end
start_element(name, attrs = []) click to toggle source
# File lib/boilerpipe/sax/html_content_handler.rb, line 30
# Handles an opening tag.
#
# Pushes a nil placeholder onto @label_stacks for the new element. A tag
# with a registered action bumps the tag level only if the action says so
# and lets the action decide whether a flush is needed (combined with any
# flush already pending); an unknown tag always bumps the level and forces
# a flush. Returns the upcased tag symbol.
def start_element(name, attrs = [])
  @label_stacks.push(nil)

  tag = name.upcase.to_sym
  action = @tag_actions[tag]

  enters_level = action ? action.changes_tag_level? : true
  @tag_level += 1 if enters_level

  # Non-short-circuit `|` on purpose: the action's start must always run.
  @flush = action ? (action.start(self, name, attrs) | @flush) : true

  @last_event = :START_TAG
  @last_start_tag = tag
end
text_document() click to toggle source
# File lib/boilerpipe/sax/html_content_handler.rb, line 175
# Flushes whatever is still buffered and wraps the collected title and text
# blocks in a new TextDocument.
def text_document
  flush_block

  title = @title
  blocks = @text_blocks
  ::Boilerpipe::Document::TextDocument.new(title, blocks)
end
token_buffer_size() click to toggle source
# File lib/boilerpipe/sax/html_content_handler.rb, line 180
# Number of characters currently held in the token buffer.
def token_buffer_size
  @token_buffer.length
end

Private Instance Methods

clear_buffers() click to toggle source
# File lib/boilerpipe/sax/html_content_handler.rb, line 273
# Resets both buffers to fresh, empty strings.
#
# The buffers are appended to in place with `<<`, so they must be mutable
# even when string literals are frozen; unary plus guarantees that.
def clear_buffers
  @token_buffer = +''
  @text_buffer = +''
end