class StarTrekCLI::Scraper
Constants
- DATE_REGEX
Public Instance Methods
each_index_group(index_url) { |cell_group| ... }
click to toggle source
This method pulls information from the Chakoteya index for each Star Trek series using an iterator. The data is constructed as a hash with `image_url`, `page_url`, and `title` properties. This is yielded as a block argument.
# File lib/StarTrekCLI/scraper.rb, line 13
#
# Scrapes the series index page at +index_url+ and yields one hash per index
# column, with the keys +:image_url+, +:page_url+ and +:title+.
#
# The first table row supplies the images and the second row supplies the
# links; the two are paired by column position. Only the first four columns
# are read.
def each_index_group(index_url)
  # URI.open (from open-uri) is used instead of a bare Kernel#open because
  # Ruby 3.0+ no longer fetches URLs through Kernel#open. The block form
  # closes the downloaded stream as soon as Nokogiri has parsed it.
  doc = URI.open(index_url) { |page| Nokogiri::HTML(page) }

  # FIXME: there might be a ghost tbody hanging around
  rows = doc.css("tbody").css("tr")
  images = rows[0].css("td a img")
  links = rows[1].css("a")

  4.times do |column|
    link = links[column]

    cell_group = {
      :image_url => images[column].attr("src"),
      :page_url => link.attr("href"),
      # the title is the text of the link's first child node
      :title => link.children.first.content.strip
    }
    yield cell_group
  end
end
each_series_page(series_url) { |episode_row| ... }
click to toggle source
This method pulls episode information from a scraped series page using an iterator. The data is constructed as a hash with `season_number`, `episode_name`, `episode_url`, and `production_number` properties. This is yielded as a block argument, once per episode row.
# File lib/StarTrekCLI/scraper.rb, line 36
#
# Scrapes the series page at +series_url+ and yields one hash per episode row,
# with the keys +:season_number+, +:episode_name+, +:episode_url+ and
# +:production_number+.
#
# Season tables are looked up as "body > table table" first; when none are
# found, "body > div table" is used instead. The first row of every table is
# skipped, and the season number is the table's 1-based position on the page.
def each_series_page(series_url)
  # URI.open (from open-uri) is used instead of a bare Kernel#open because
  # Ruby 3.0+ no longer fetches URLs through Kernel#open. The block form
  # closes the downloaded stream as soon as Nokogiri has parsed it.
  doc = URI.open(series_url) { |page| Nokogiri::HTML(page) }

  # FIXME: seasons will have some problems like 101 + 102 or the animated
  # series as season "4"
  nested_tables = doc.css("body > table table")
  nested = !nested_tables.empty?
  season_tables = nested ? nested_tables : doc.css("body > div table")

  season_tables.each_with_index do |table, index|
    # drop(1) skips the first row of each table (presumably a header row)
    table.css("tr").drop(1).each do |row|
      cells = row.css("td")
      # The nested layout collects anchors from every cell of the row, while
      # the fallback layout only looks inside the first cell.
      link = nested ? cells.css("a") : cells[0].css("a")

      episode_row = {
        :season_number => index + 1,
        :episode_name => link.children.text.strip,
        :episode_url => link.attr("href").value,
        # to_i truncates any second set of production numbers
        :production_number => cells[1].text.strip.to_i.to_s
      }
      yield episode_row
    end
  end
end
episode_page_header(episode_url) { |episode_stuff| ... }
click to toggle source
This method pulls information from the header of a scraped episode page. The data is constructed as a hash with `episode_name`, `star_date`, and `air_date` properties. This is yielded once as a block argument.
# File lib/StarTrekCLI/scraper.rb, line 83
#
# Scrapes the header of the episode page at +episode_url+ and yields a single
# hash with the keys +:episode_name+, +:star_date+ and +:air_date+.
#
# The header is the first "body > p" element: the episode name is its bold
# text, and the dates are extracted from its last child with DATE_REGEX
# (capture 1 is the star date, capture 2 the air date).
def episode_page_header(episode_url)
  # URI.open (from open-uri) is used instead of a bare Kernel#open because
  # Ruby 3.0+ no longer fetches URLs through Kernel#open. The block form
  # closes the downloaded stream as soon as Nokogiri has parsed it.
  doc = URI.open(episode_url) { |page| Nokogiri::HTML(page) }

  header = doc.css("body > p").first
  dates = header.children.last.text.match(DATE_REGEX)

  # The Star Trek pilot does *not* have any dates; when the header does not
  # match, both date fields are handed to the caller as nil.
  episode_stuff = {
    :episode_name => header.css("b").text.strip,
    :star_date => dates && dates[1].strip,
    :air_date => dates && dates[2].strip
  }
  yield episode_stuff
end