class Sumitup::Parser

Attributes

attributes[RW]
elements[RW]
image_count[RW]
image_width_limit[RW]
max_images[RW]
max_words[RW]
min_image_size[RW]
omission[RW]
protocols[RW]
remove_contents[RW]
word_count[RW]

Public Class Methods

new(options = {}) click to toggle source
# File lib/sumitup/parser.rb, line 9
# Builds a parser, taking each setting from +options+ and falling back to a
# built-in default when the option is missing (or nil/false).
#
# options - Hash of overrides. Recognised keys: :omission, :max_words,
#           :min_image_size, :image_width_limit, :max_images, :elements,
#           :attributes, :protocols and :remove_contents.
def initialize(options = {})
  # Running counters always start at zero.
  self.word_count = 0
  self.image_count = 0

  fallbacks = {
    :omission          => '',
    :max_words         => 100,
    :min_image_size    => 40,
    :image_width_limit => 230,
    :max_images        => 1,
    # White listed elements
    :elements          => %w(
      a abbr b blockquote cite code dfn em i kbd mark q samp small s strike strong sub sup time u var
      br dd dl dt li ol p pre ul img span
    ),
    # Attributes allowed per element
    :attributes        => {
      'a'          => ['href', 'title'],
      'blockquote' => ['cite'],
      'img'        => ['alt', 'src', 'title', 'width', 'height']
    },
    # URL schemes allowed per element/attribute
    :protocols         => {
      'a' => {'href' => ['http', 'https', 'mailto']}
    },
    # Elements whose contents are listed for removal
    :remove_contents   => %w(style script)
  }

  # The table is rebuilt on every call, so each parser gets its own
  # default collections rather than sharing them.
  fallbacks.each do |setting, fallback|
    send("#{setting}=", options[setting] || fallback)
  end
end

Public Instance Methods

image_height(existing_height, existing_width, image_width_limit) click to toggle source
# File lib/sumitup/parser.rb, line 127
# Computes the height an image should have once its width is scaled to
# +image_width_limit+, keeping the aspect ratio.
#
# existing_height   - Current height, or nil when unknown (the image is then
#                     treated as square).
# existing_width    - Current width; nil, zero or a negative value is treated
#                     as unknown and replaced by +image_width_limit+.
# image_width_limit - The width the image will be displayed at.
#
# Returns the scaled height as an Integer (truncated), or 0 when no positive
# width can be determined at all.
def image_height(existing_height, existing_width, image_width_limit)
  limit = image_width_limit.to_f
  width = existing_width.to_f
  # A zero width would make the division yield Infinity/NaN and #to_i raise
  # FloatDomainError, so fall back to the target width instead.
  width = limit unless width > 0
  return 0 unless width > 0

  # If height is unknown there is not a lot of other options: assume square.
  height = existing_height.nil? ? width : existing_height.to_f

  # Multiply before dividing: computing the ratio first loses precision
  # (e.g. 100.0 * (230.0 / 100.0) truncates to 229 instead of 230).
  (height * limit / width).to_i
end
image_transformer() click to toggle source
# File lib/sumitup/parser.rb, line 160
# Builds a transformer lambda that limits and resizes <img> nodes.
#
# The lambda receives an +env+ Hash and reads env[:node] and
# env[:node_name]. Nodes other than img are ignored. For an img node it:
#
# * removes the node once +max_images+ images have been kept;
# * takes width/height from the attributes, and asks request_image_size for
#   the real size when either one is missing;
# * removes the node unless its width is greater than +min_image_size+;
# * otherwise counts it and rewrites width/height so the image is
#   +image_width_limit+ wide with a proportional height.
#
# The lambda always returns nil.
def image_transformer
  me = self
  lambda do |env|
    next unless env[:node_name] == 'img'

    node = env[:node]

    # Keeping this image must not push us over the limit.
    if me.image_count >= me.max_images
      node.remove
      next
    end

    width  = node['width'] && node['width'].to_i
    height = node['height'] && node['height'].to_i

    if width.nil? || height.nil?
      # Read the attribute *value*: the size lookup needs a String URL.
      src = node['src']
      if src && !src.empty?
        fetched_width, fetched_height =
          begin
            me.request_image_size(src)
          rescue StandardError
            # Best effort only: an unreachable or unreadable image simply
            # leaves us with whatever the markup declared.
            [nil, nil]
          end
        # Only trust a complete measurement; a failed lookup must not wipe
        # out a width that was declared in the markup.
        width, height = fetched_width, fetched_height if fetched_width && fetched_height
      end
    end

    if (width || 0) > me.min_image_size
      me.image_count += 1
      node['height'] = me.image_height(height, width, me.image_width_limit).to_s
      node['width'] = me.image_width_limit.to_s
    else
      node.remove
    end

    nil
  end
end
is_blank?(text) click to toggle source
# File lib/sumitup/parser.rb, line 111
# Reports whether +text+ is nil or empty.
#
# Whitespace-only strings are not considered blank.
def is_blank?(text)
  return true if text.nil?

  text.empty?
end
request_image_size(image_url) click to toggle source
# File lib/sumitup/parser.rb, line 115
# Downloads an image over HTTP(S) and reports its dimensions.
#
# image_url - The image location; converted with #to_s, so a String or any
#             object whose #to_s yields the URL is accepted.
#
# Only http and https URLs are fetched. The URL normally comes from markup
# being sanitized, and handing such a value to Kernel#open would execute
# "|command" strings and read arbitrary local files.
#
# Returns [width, height]; [nil, nil] when the URL is not http(s).
# Errors from URL parsing, the download or the size detection propagate to
# the caller.
def request_image_size(image_url)
  require 'open-uri' # idempotent; provides URI::HTTP#open

  uri = URI.parse(image_url.to_s)
  return [nil, nil] unless uri.is_a?(URI::HTTP) # URI::HTTPS is a subclass

  width = nil
  height = nil
  uri.open('rb') do |f|
    # Dimensions comes from outside this file; the calls below are the ones
    # the original code relied on.
    img = Dimensions(f)
    img.read
    width = img.width
    height = img.height
  end
  [width, height]
end
snippet(text, max) click to toggle source

Truncates text at a word boundary. Parameters:

text      - The text to truncate
max       - The maximum number of words to keep
# File lib/sumitup/parser.rb, line 98
# Truncates +text+ at a word boundary.
#
# text - The String to truncate; nil is treated as empty.
# max  - The maximum number of words to keep; values below zero keep none.
#
# Words are whatever String#split yields, so runs of whitespace collapse to
# a single space in the result.
#
# Returns a two-element Array: the kept words joined by single spaces, and
# the number of words kept. The first element is always a String, '' when
# nothing is kept.
def snippet(text, max)
  # TODO figure out support for pre that contains code blocks..
  words = text.to_s.split.first([max, 0].max)
  [words.join(' '), words.size]
end
summarize(html, max = nil) click to toggle source

Removes HTML and generates a summary

# File lib/sumitup/parser.rb, line 44
# Produces a summary of an HTML string.
#
# html - The HTML source; nil or '' yields ''.
# max  - Optional word limit, handed on to summarize_fragment.
#
# Returns the summary serialized with #to_html.
def summarize(html, max = nil)
  if is_blank?(html)
    ''
  else
    # Parse a copy of the caller's string rather than the string itself.
    fragment = Nokogiri::HTML::DocumentFragment.parse(html.dup)
    summarize_fragment(fragment, max).to_html
  end
end
summarize_fragment(node, max = nil) click to toggle source
# File lib/sumitup/parser.rb, line 50
# Sanitizes +node+ in place using the parser's settings and both
# transformers, then trims the cleaned result with summarize_node.
#
# node - The parsed fragment to clean.
# max  - Optional word limit, handed on to summarize_node.
#
# Returns whatever summarize_node returns for the cleaned node.
def summarize_fragment(node, max = nil)
  # Every run starts counting from scratch.
  self.word_count = self.image_count = 0

  transformers = [word_transformer, image_transformer]
  sanitized = Sanitize.clean_node!(
    node,
    :elements => elements,
    :attributes => attributes,
    :protocols => protocols,
    :remove_contents => remove_contents,
    :transformers => transformers
  )

  summarize_node(sanitized, max)
end
summarize_node(node, max = nil) click to toggle source
# File lib/sumitup/parser.rb, line 63
# Trims +node+ and everything below it to at most +max+ words, updating
# +word_count+ as text is kept.
#
# node - The node to process; it is modified in place.
# max  - Word limit; defaults to +max_words+ when nil.
#
# Children are handled before the node itself, so an element whose text was
# entirely dropped is seen as empty and removed too (img and br excepted).
#
# Returns +node+.
def summarize_node(node, max = nil)
  limit = max || self.max_words

  # Depth first: settle every child before judging this node.
  node.children.each { |child| summarize_node(child, limit) }

  if !node.text?
    # Remove empty nodes
    hollow = node.text.empty? && node.children.empty?
    node.remove if hollow && !['img', 'br'].include?(node.name)
  elsif self.word_count > limit
    node.remove
  else
    # Keep only as many words as the remaining budget allows.
    kept, used = snippet(node.inner_text, limit - self.word_count)
    if used == 0 || is_blank?(kept)
      node.remove
    else
      self.word_count += used
      node.content = kept
    end
  end

  node
end
word_transformer() click to toggle source
# File lib/sumitup/parser.rb, line 137
# Builds a transformer lambda that drops hidden and empty elements.
#
# The lambda receives an +env+ Hash and reads env[:node] and
# env[:node_name]. Non-element nodes are ignored. An element is removed when
# either:
#
# * its style attribute contains "display: none" (matched case-insensitively,
#   since CSS property names and keywords are case-insensitive), or
# * it has no text and no children and is not an img or br.
#
# The lambda always returns nil.
def word_transformer
  lambda do |env|
    node = env[:node]
    next unless node.element?

    style = node['style']
    hidden = style && style =~ /display\s*:\s*none/i

    # Remove nodes with display none, and empty nodes
    if hidden || (node.text.empty? && node.children.empty? && !['img', 'br'].include?(env[:node_name]))
      node.remove
    end

    nil
  end
end