class BcCrawler::Release

Attributes

about[R]
art_fullsize_url[R]
art_id[R]
art_thumb_url[R]
artist[R]
band_id[R]
credits[R]
data[R]
has_audio[R]
html[R]
id[R]
purchase_url[R]
release_date[R]
title[R]
tracks[R]
type[R]
url[R]

Public Class Methods

new(url) click to toggle source
# File lib/bc_crawler/release.rb, line 9
# Create a release for the given page URL. Nothing is fetched here; the
# track list starts out empty.
#
# @param url [String] address of the release page
def initialize(url)
  @tracks = []
  @url    = url
end

Public Instance Methods

crawl(nodes = %w(artFullsizeUrl artThumbURL current hasAudio trackinfo url)) click to toggle source

Scan the HTML for a particular JavaScript snippet where a variable named “TralbumData” is assigned. TralbumData contains all information about the release (and its tracks), but has to be cleaned first in order to get a valid JSON object.

By default, only the main nodes in TralbumData are crawled. More nodes are available; the full list is:

nodes = %w(album_is_preorder album_release_date artFullsizeUrl artist artThumbURL
           current defaultPrice featured_track_id FREE freeDownloadPage hasAudio
           id initial_track_num is_preorder item_type last_subscription_item
           maxPrice minPrice packages PAID playing_from preorder_count trackinfo url)
# File lib/bc_crawler/release.rb, line 24
# Fetch the release page, pull the JavaScript object assigned to
# "TralbumData" out of the HTML and parse the requested top-level nodes
# into @data. Finishes by calling load_release_info.
#
# @param nodes [Array<String>] names of the top-level TralbumData keys to extract
# @return the result of load_release_info
# @raise [RuntimeError] if the page contains no TralbumData assignment or
#   one of the requested nodes is missing
# @raise [JSON::ParserError] if the extracted nodes do not form valid JSON
def crawl(nodes = %w(artFullsizeUrl artThumbURL current hasAudio trackinfo url))
  puts "Crawling #{@url}"
  @nodes = nodes

  # Kernel#open stopped opening URLs in Ruby 3.0; URI.open (open-uri) is the
  # supported entry point. The block form closes the handle after reading.
  @html = URI.open(@url, &:read)

  # The /m flag lets "." span line breaks, so the whole object literal is
  # captured without temporarily replacing newlines.
  js_content = @html[/var TralbumData = \{(.*?)\};/m, 1]
  raise "TralbumData not found in #{@url}" if js_content.nil?

  js_content = js_content.gsub("\t", '')       # remove tabs
                         .gsub("\" + \"", '')  # special bug in "url" node

  # Turn each requested "key: value," line into a JSON member.
  json_nodes = @nodes.map do |node|
    value = js_content[/^ *#{Regexp.escape(node)} *:(.*)$/, 1]
    raise "Node '#{node}' not found in TralbumData of #{@url}" if value.nil?

    # Quote only the key. Substituting the node name across the whole line
    # would also rewrite occurrences inside the value (e.g. "url" in a URL).
    "\"#{node}\": #{value.sub(/\s*,\s*\z/, '').strip}"
  end

  @data = JSON.parse("{ #{json_nodes.join(', ')} }")

  # Finally, we load the release info
  load_release_info
end
load_release_info() click to toggle source

Assign some of the main information to instance variables. TODO: make ALL information available as instance variables.

# File lib/bc_crawler/release.rb, line 51
# Copy the main release fields from the parsed TralbumData hash (@data)
# into instance variables, then build the track list via load_track_info.
#
# Top-level values come straight from @data; everything else is read from
# the nested "current" hash.
#
# @return the result of load_track_info
def load_release_info
  current = @data['current']

  @art_fullsize_url   = @data['artFullsizeUrl']
  @art_thumb_url      = @data['artThumbURL']
  # Key corrected to "art_id" to match the attribute name; the former
  # "art_it" spelling looks like a typo.
  @art_id             = current['art_id']
  @about              = current['about']
  @featured_track_id  = current['featured_track_id']
  @credits            = current['credits']
  @artist             = current['artist']
  @purchase_url       = current['purchase_url']
  @band_id            = current['band_id']
  @id                 = current['id']
  @release_date       = current['release_date']
  @type               = current['type']
  @title              = current['title']
  @has_audio          = @data['hasAudio']
  load_track_info
end
load_track_info() click to toggle source

Tracks have their own class

# File lib/bc_crawler/release.rb, line 70
# Rebuild @tracks with one Track per entry of @data['trackinfo'].
#
# The list is emptied first so that loading again (e.g. a second crawl)
# does not append duplicates of tracks that were already loaded. The same
# array object is reused, so references obtained earlier stay valid.
#
# @return the @data['trackinfo'] collection that was iterated
def load_track_info
  @tracks.clear
  @data['trackinfo'].each do |track|
    @tracks << Track.new(self, track)
  end
end
to_s() click to toggle source
# File lib/bc_crawler/release.rb, line 76
    # Multi-line, human-readable summary of the release: URL, artist, title
    # and track count, followed by a hint line when no artist has been
    # loaded yet. Every line is prefixed with six spaces and ends with a
    # newline; the hint line is present but blank once the artist is set.
    #
    # @return [String]
    def to_s
      hint = @artist.nil? ? '(use .crawl method to fetch the missing data)' : nil
      rows = [
        "URL : #{@url}",
        "Artist : #{@artist}",
        "Release title : #{@title}",
        "Number of tracks : #{@tracks.count}",
        hint.to_s
      ]
      rows.map { |row| "      #{row}\n" }.join
    end