class RecipeCrawler::Crawler

This is the main class to crawl recipes from a given url

1. The crawler will crawl the given url to find other recipe urls on the website
2. it will crawl the urls found to discover yet more urls, again and again
3. it will scrape the urls found to extract the recipe data

@attr_reader url [String] first url parsed @attr_reader host [Symbol] symbol of the url's host @attr_reader scraped_urls [Array<String>] urls already scraped @attr_reader crawled_urls [Array<String>] urls already crawled @attr_reader to_crawl_urls [Array<String>] urls waiting to be crawled @attr_reader recipes [Array<RecipeScraper::Recipe>] recipes fetched @attr_reader db [SQLite3::Database] Sqlite database where recipes will be saved

Constants

ALLOWED_URLS

URLs that the crawler can parse

Attributes

crawled_urls[R]
host[R]
interval_sleep_time[RW]
recipes[R]
scraped_urls[R]
to_crawl_urls[R]
url[R]

Public Class Methods

new(url) click to toggle source

Create a Crawler @param url [String] a recipe url used as the starting point to crawl other ones

# File lib/recipe_crawler/crawler.rb, line 33
# Build a new Crawler for the given url.
#
# Validates the url (url_valid? also records the matching host), then
# prepares the crawl bookkeeping and opens/initialises the SQLite
# results database.
#
# @param url [String] a url of a recipe used to crawl other ones
# @raise [ArgumentError] when the url does not match any allowed host
def initialize(url)
  @url = url
  raise ArgumentError, 'This url cannot be used' unless url_valid?

  @recipes = []
  @crawled_urls = []
  @scraped_urls = []
  @to_crawl_urls = [url]
  @interval_sleep_time = 0
  @db = SQLite3::Database.new 'results.sqlite3'
  # Make sure the destination table exists before any insert happens.
  @db.execute "CREATE TABLE IF NOT EXISTS recipes(
                                    Id INTEGER PRIMARY KEY,
                                    title TEXT,
                                    preptime INTEGER,
                                    cooktime INTEGER,
                                    ingredients TEXT,
                                    steps TEXT,
                                    image TEXT
                            )"
end

Public Instance Methods

crawl!(limit: 2, interval_sleep_time: 0) { |recipe| ... } click to toggle source

Start the crawl

@param limit [Integer] the maximum number of scraped recipes @param interval_sleep_time [Integer] waiting time between scraping @yield [RecipeScraper::Recipe] as recipe scraped

# File lib/recipe_crawler/crawler.rb, line 76
# Start the crawl.
#
# Repeatedly takes the first url of the queue, harvests new links from
# it (get_links) and scrapes it, until the queue is empty or +limit+
# recipes have been collected. Only the :cuisineaz host is supported.
#
# NOTE(review): assumes get_links removes the current url from
# @to_crawl_urls — otherwise this loop never terminates; confirm.
#
# @param limit [Integer] the maximum number of scraped recipes
# @param interval_sleep_time [Integer] waiting time (seconds) between scrapings
# @yield [RecipeScraper::Recipe] each recipe scraped
# @raise [NotImplementedError] when the host is not supported
def crawl!(limit: 2, interval_sleep_time: 0)
  raise NotImplementedError unless @host == :cuisineaz

  while !@to_crawl_urls.empty? && (limit > @recipes.count)
    # find all links on url given (and urls of these)
    url = @to_crawl_urls.first
    if url.nil?
      # Discard bogus nil entries instead of spinning on them forever
      # (the original `next` left the nil in place: infinite loop).
      @to_crawl_urls.shift
      next
    end

    get_links url
    # now scrape the url itself
    recipe = scrape url
    yield recipe if recipe && block_given?
    sleep interval_sleep_time
  end
end
save(recipe) click to toggle source

Save recipe @param recipe [RecipeScraper::Recipe] as recipe to save

@return [Boolean] as true if success

# File lib/recipe_crawler/crawler.rb, line 147
# Save a recipe into the results database.
#
# Fix: the SQL names a :cooktime placeholder but the bind hash never
# supplied it, so every row was stored with a NULL cooktime.
#
# @param recipe [RecipeScraper::Recipe] as recipe to save
# @return [Boolean] as true if success, false when SQLite raises
def save(recipe)
  @db.execute "INSERT INTO recipes (title, preptime, cooktime, ingredients, steps, image)
                                            VALUES (:title, :preptime, :cooktime, :ingredients, :steps, :image)",
              title: recipe.title,
              preptime: recipe.preptime,
              cooktime: recipe.cooktime,
              ingredients: recipe.ingredients.join("\n"),
              steps: recipe.steps.join("\n"),
              image: recipe.image

  true
rescue SQLite3::Exception => e
  puts "Exception occurred #{e}"
  false
end
scrape(url) click to toggle source

Scrape given url @param url [String] as url to scrape

@return [RecipeScraper::Recipe] as recipe scraped @return [nil] if recipe cannot be fetched

# File lib/recipe_crawler/crawler.rb, line 104
# Scrape a single url and persist the resulting recipe.
#
# @param url [String] as url to scrape
# @return [RecipeScraper::Recipe] as recipe scraped
# @return [nil] if the page cannot be fetched
# @raise [SQLite3::Exception] when the recipe cannot be saved
def scrape(url)
  fetched = RecipeScraper::Recipe.new url
  @scraped_urls << url
  @recipes << fetched
  # A failed save is fatal; an HTTP failure is merely skipped (nil).
  raise SQLite3::Exception, 'cannot save recipe' unless save fetched

  fetched
rescue OpenURI::HTTPError
  nil
end
url_valid?() click to toggle source

Check if the url can be parsed and set the host

@return [Boolean] true if url can be parsed

# File lib/recipe_crawler/crawler.rb, line 61
# Check if the url can be parsed and, when it can, remember its host.
#
# Scans ALLOWED_URLS for an entry whose url fragment is contained in
# the crawler's url; side effect: sets @host on a match.
#
# @return [Boolean] true if url can be parsed
def url_valid?
  matching = ALLOWED_URLS.find { |_host, allowed_fragment| url.include?(allowed_fragment) }
  return false if matching.nil?

  @host = matching.first
  true
end