class Newly::PageCrawler
Public Class Methods
new(host, document)
click to toggle source
# File lib/newly/page_crawler.rb, line 3
# Builds a crawler bound to a site host and an already-loaded page.
#
# @param host [String] base URL; #link and #image interpolate it as
#   "#{@host}/..." when they rewrite relative paths
# @param document [#css] object queried with CSS selectors (presumably a
#   Nokogiri document -- confirm against the caller)
def initialize(host, document)
  @host, @document = host, document
end
Public Instance Methods
image(element)
click to toggle source
# File lib/newly/page_crawler.rb, line 28
# Returns the `src` attribute of the first node matching +element+,
# rewritten in two optional steps.
#
# 1. If the value contains "==/", only the part after the last "==/" is
#    kept and "http://" is put in front of it (presumably unwrapping a
#    proxied/encoded image URL -- confirm with real data).
# 2. If the value then contains "../", it is prefixed with "#{@host}/" and
#    every "../" is removed from the combined string.
#
# @param element [String] CSS selector
# @return [String, nil] the image URL, or nil when nothing was found
def image(element)
  src = find(element, 'src')
  return src unless src

  src = "http://#{src.split('==/').last}" if src.include?('==/')
  return src unless src.include?('../')

  # gsub runs on the host-prefixed string, exactly as before.
  "#{@host}/#{src}".gsub('../', '')
end
link(element)
click to toggle source
# File lib/newly/page_crawler.rb, line 22
# Returns the `href` attribute of the first node matching +element+ as an
# absolute URL.
#
# An href that already starts with "http://" or "https://" is returned
# untouched. Anything else is prefixed with "#{@host}/" and has every
# "../" removed from the combined string.
#
# The absolute-URL check looks only at the start of the href. Testing
# whether the href merely *contains* "http" misclassifies relative paths
# such as "news/http-guide.html" and leaves them unresolved.
#
# @param element [String] CSS selector
# @return [String, nil] the link URL, or nil when nothing was found
def link(element)
  href = find(element, 'href')
  return href unless href
  return href if href.start_with?('http://', 'https://')

  "#{@host}/#{href}".gsub('../', '')
end
text(element)
click to toggle source
# File lib/newly/page_crawler.rb, line 15
# Returns the text content of the nodes matching +element+.
#
# @param element [String] CSS selector; nil or empty yields nil
# @return [String, nil] the text, or nil when the selector is blank or the
#   extracted text is empty
def text(element)
  return unless valid?(element)

  content = get(element).text
  valid?(content) ? content : nil
end
titleize(element)
click to toggle source
# File lib/newly/page_crawler.rb, line 8
# Returns the text of +element+ with its first character capitalised.
# Only the first character is touched; the rest of the string is left
# exactly as extracted.
#
# @param element [String] CSS selector
# @return [String, nil] the adjusted text, or nil when #text returns nil
def titleize(element)
  heading = text(element)
  return heading unless heading

  # Replace the first character in place with its capitalised form.
  heading[0] = heading.capitalize[0]
  heading
end
Private Instance Methods
find(element, type)
click to toggle source
# File lib/newly/page_crawler.rb, line 46
# Reads attribute +type+ from the first node matching +element+.
#
# @param element [String] CSS selector; nil or empty yields nil
# @param type [String] attribute name, e.g. 'href' or 'src'
# @return [Object, nil] whatever node[type] gives for the first match, or
#   nil when the selector is blank or nothing matched
def find(element, type)
  return unless valid?(element)

  attributes = get(element).map { |node| node[type] }
  attributes.first
end
get(element)
click to toggle source
# File lib/newly/page_crawler.rb, line 42
# Runs a CSS query against the wrapped document.
#
# @param element [String] CSS selector
# @return [Object] whatever @document.css returns (presumably a Nokogiri
#   node set -- confirm against the document type in use)
def get(element)
  page = @document
  page.css(element)
end
valid?(str)
click to toggle source
# File lib/newly/page_crawler.rb, line 38
# Tells whether +str+ is present, i.e. truthy and not empty.
#
# @param str [#empty?, nil] value to check (selectors and extracted text)
# @return [Boolean, nil] true/false for a truthy value depending on
#   emptiness; a falsy +str+ (nil or false) is returned as-is
def valid?(str)
  return str unless str

  !str.empty?
end