class Scraper
require 'pry'
Public Class Methods
scrape_index_page(index_url)
click to toggle source
# File lib/rubedility/scraper.rb, line 7
# Fetches the lesson index page and builds one hash per lesson link found
# under "div.lessons_list a". Prints a "." per lesson as a progress marker.
#
# @param index_url [String] URL of the index page. It is also used verbatim
#   as the prefix of every lesson URL (the last path segment of each link's
#   href is appended to it), so it presumably ends with "/" -- confirm
#   against callers.
# @return [Array<Hash>] hashes with the keys :name, :number and :lesson_url
def self.scrape_index_page(index_url)
  # URI.open instead of bare `open`: open-uri stopped patching Kernel#open
  # for URLs in Ruby 3.0, and Kernel#open would run a command for "|..." input.
  index = Nokogiri::HTML(URI.open(index_url))

  index.css("div.lessons_list a").map do |lesson|
    print "."
    # Take the first run of digits. String#delete("Lesson") strips the
    # individual characters L/e/s/o/n, not the word, and only yielded the
    # number when nothing else preceded the digits.
    number = lesson.css("div.num").text[/\d+/].to_i
    slug = lesson.attr("href").split("/").last

    {
      name: lesson.css("div.title").text,
      number: number,
      lesson_url: "#{index_url}#{slug}"
    }
  end
end
scrape_lesson_page(lesson_url)
click to toggle source
# File lib/rubedility/scraper.rb, line 22
# Fetches one lesson page and extracts the lesson-level details plus one
# hash per task listed in a "div.task-box" element. Prints a "." as a
# progress marker.
#
# @param lesson_url [String] absolute URL of the lesson page; its scheme and
#   host are reused as the prefix of every task URL
# @return [Array(Hash, Array<Hash>), nil] a two-element array:
#   the lesson details (:reading_url, :tests_started, :tests_solved) and the
#   task hashes (:name, :task_url, :difficulty, :tagline, :task_reading_url);
#   nil when the page request fails with an HTTP error
def self.scrape_lesson_page(lesson_url)
  print "."
  # URI.open instead of bare `open`: open-uri stopped patching Kernel#open
  # for URLs in Ruby 3.0, and Kernel#open would run a command for "|..." input.
  lesson = Nokogiri::HTML(URI.open(lesson_url))

  # nil when the page has no readings link (or the link carries no href).
  reading_url = lesson.at_css("a#readings")&.[]("href")

  # "scheme://host" part of the lesson URL. Task hrefs are appended to it
  # as-is, so they are presumably site-absolute paths -- confirm.
  site_root = lesson_url.split("/")[0..2].join("/")

  task_hashes_array = lesson.css("div.task-box").map do |task_row|
    {
      name: task_row.css("h4.title").text.strip,
      task_url: "#{site_root}#{task_row.css('a').attr('href').text}",
      difficulty: task_row.css("div.difficulty").text.strip,
      tagline: task_row.css("div.synopsis").text.strip,
      task_reading_url: reading_url
    }
  end

  lesson_details = {
    reading_url: reading_url,
    tests_started: lesson.css("span.started span.num").text.to_i,
    tests_solved: lesson.css("span.finished span.num").text.to_i
  }

  [lesson_details, task_hashes_array]
rescue OpenURI::HTTPError => er
  # Every HTTP error status lands here, not only 404; the message text is
  # kept unchanged and `er` carries the real status.
  puts "404, Lesson not found"
  puts lesson_url
  puts er
  nil
end
scrape_task_page(task_url)
click to toggle source
# File lib/rubedility/scraper.rb, line 54
# Fetches one task page and returns its description text, reflowed for
# command-line display. Prints a "." as a progress marker.
#
# @param task_url [String] URL of the task page
# @return [Hash, nil] { content: String } holding the text of the first
#   "div.desc-rb-en div" element; nil when the page request fails with an
#   HTTP error
def self.scrape_task_page(task_url)
  print "."
  # URI.open instead of bare `open`: open-uri stopped patching Kernel#open
  # for URLs in Ruby 3.0, and Kernel#open would run a command for "|..." input.
  task = Nokogiri::HTML(URI.open(task_url))

  # The page mixes single and repeated newlines; a blank line between
  # paragraphs reads better in a terminal, so every run of newlines becomes
  # exactly two.
  content = task.css("div.desc-rb-en div").first.text.gsub(/\n+/, "\n\n")

  { content: content }
rescue OpenURI::HTTPError => er
  puts "404'd!"
  puts task_url
  puts er
  nil
end