class Tjcrawler::Parser

Public Class Methods

new() { |'Block required'| ... } click to toggle source

A Nokogiri document is yielded to the block each time a page is parsed.

# File lib/tjcrawler/parser.rb, line 7
def initialize &block
  yield 'Block required' unless block_given?
  @strategy = block
end

Public Instance Methods

parse(content) click to toggle source
# File lib/tjcrawler/parser.rb, line 12
def parse content
  doc = Nokogiri::HTML(content)
  ret = nil
  @@semaphore.synchronize{ ret = @strategy[doc] }
  print :'.'
  ret
end
start() click to toggle source
# File lib/tjcrawler/parser.rb, line 20
def start
  loop do
    sleep 1 until page = find_next
    parse(page.content)
    page.touch(:parsed_at)
  end
end

Private Instance Methods

find_next() click to toggle source
# File lib/tjcrawler/parser.rb, line 30
def find_next
  Page.where('crawled_at IS NOT NULL AND (parsed_at IS NULL OR parsed_at < ?)', 1.day.ago).order('parsed_at IS NOT NULL, parsed_at').first
end