class BcCrawler::Release
Attributes
about[R]
art_fullsize_url[R]
art_id[R]
art_thumb_url[R]
artist[R]
band_id[R]
credits[R]
data[R]
featured_track_id[R]
has_audio[R]
html[R]
id[R]
purchase_url[R]
release_date[R]
title[R]
tracks[R]
type[R]
url[R]
Public Class Methods
new(url)
click to toggle source
# File lib/bc_crawler/release.rb, line 9
# Remember the release URL and start with an empty track list.
#
# url - address of the release page; stored as given
def initialize(url)
  @tracks = []
  @url = url
end
Public Instance Methods
crawl(nodes = %w(artFullsizeUrl artThumbURL current hasAudio trackinfo url))
click to toggle source
Scan the HTML for a particular JavaScript snippet where a variable named “TralbumData” is assigned. TralbumData contains all information about the release (and its tracks), but has to be cleaned first in order to get a valid JSON object.
By default, only the main nodes in TralbumData are crawled. There are more nodes available.
nodes = %w(album_is_preorder album_release_date artFullsizeUrl artist artThumbURL current defaultPrice featured_track_id FREE freeDownloadPage hasAudio id initial_track_num is_preorder item_type last_subscription_item maxPrice minPrice packages PAID playing_from preorder_count trackinfo url)
# File lib/bc_crawler/release.rb, line 24
# Fetch the release page, extract the requested nodes of the JavaScript
# object assigned to "var TralbumData" into @data (a Hash parsed from JSON),
# then populate the release attributes via load_release_info.
#
# nodes - names of the top-level TralbumData keys to extract. A requested
#         node that does not occur in the page is skipped.
#
# Returns whatever load_release_info returns.
# Raises RuntimeError when the page contains no TralbumData assignment and
# JSON::ParserError when the collected nodes do not form valid JSON.
def crawl(nodes = %w(artFullsizeUrl artThumbURL current hasAudio trackinfo url))
  puts "Crawling #{@url}"
  @nodes = nodes

  # URI.open (from open-uri, which the URL fetch already depends on) only
  # performs a URL fetch; bare Kernel#open would also run "| command"
  # strings and does not handle URLs on Ruby 3.
  @html = URI.open(@url).read

  # /m lets "." span newlines, so the whole object literal is captured
  js_content = @html[/var TralbumData = \{(.*?)\};/m, 1]
  raise "TralbumData not found in #{@url}" if js_content.nil?

  # remove tabs, and join the string concatenation used in the "url" node
  js_content = js_content.delete("\t").gsub('" + "', '')

  # scan the JavaScript code text for the given nodes
  json_nodes = @nodes.map do |node|
    key = Regexp.escape(node)
    line = js_content[/^ *#{key} *:.*$/] # the "node: value," line of the object
    next if line.nil?

    # Quote only the key (first occurrence); the value may contain the
    # node name too and must stay untouched.
    line.sub(/#{key}/) { "\"#{node}\"" }
        .sub(/\s*,\s*\z/, '') # drop the trailing comma
  end.compact

  @data = JSON.parse("{ #{json_nodes.join(', ')} }")

  # Finally, we load the release info
  load_release_info
end
load_release_info()
click to toggle source
Assign some of the main information to instance variables. TODO: make ALL information available as instance variables.
# File lib/bc_crawler/release.rb, line 51
# Copy the main release fields from the parsed TralbumData (@data) into
# instance variables, then build the track list via load_track_info.
# TODO: make ALL information available as instance variables
#
# Returns whatever load_track_info returns.
def load_release_info
  # 'current' is absent when the crawl was limited to other nodes; fall back
  # to an empty hash so the dependent attributes become nil instead of raising.
  current = @data['current'] || {}

  @art_fullsize_url  = @data['artFullsizeUrl']
  @art_thumb_url     = @data['artThumbURL']
  @has_audio         = @data['hasAudio']

  @art_id            = current['art_id'] # key is 'art_id' ('art_it' never matched anything)
  @about             = current['about']
  @featured_track_id = current['featured_track_id']
  @credits           = current['credits']
  @artist            = current['artist']
  @purchase_url      = current['purchase_url']
  @band_id           = current['band_id']
  @id                = current['id']
  @release_date      = current['release_date']
  @type              = current['type']
  @title             = current['title']

  load_track_info
end
load_track_info()
click to toggle source
Tracks have their own class
# File lib/bc_crawler/release.rb, line 70
# Rebuild @tracks from the 'trackinfo' node of @data, wrapping each entry in
# a Track that receives this release and the raw entry.
#
# Returns the list of raw track entries that was iterated.
def load_track_info
  # Start from an empty list so crawling the same release again does not
  # append a second copy of every track.
  @tracks.clear

  # 'trackinfo' is absent when the crawl was limited to other nodes
  (@data['trackinfo'] || []).each do |track|
    @tracks << Track.new(self, track)
  end
end
to_s()
click to toggle source
# File lib/bc_crawler/release.rb, line 76 def to_s <<-EOF URL : #{ @url } Artist : #{ @artist } Release title : #{ @title } Number of tracks : #{ @tracks.count } #{ '(use .crawl method to fetch the missing data)' if @artist.nil? } EOF end