class UncleKryon::KryonAumYearAlbumParser
Attributes
album[RW]
artist[RW]
options[RW]
trainers[RW]
training[RW]
training?[RW]
updated_on[RW]
url[RW]
Public Class Methods
new(artist=nil,url=nil,album: nil,training: false,train_filepath: nil,updated_on: nil, **options)
click to toggle source
# File lib/unclekryon/parsers/kryon_aum_year_album_parser.rb, line 40
#
# Sets up a parser for a single album page and registers the two trainers
# ('aum_year_album' and 'aum_year_album_mini_desc') that #parse_dump uses.
#
# @param artist         stored in +@artist+; may also be given later to #parse_site
# @param url            stored in +@url+; may also be given later to #parse_site
# @param album          stored in +@album+
# @param training       stored in +@training+; when truthy, #parse_dump trains instead of tagging
# @param train_filepath passed straight to +Trainers.new+
# @param updated_on     timestamp stamped on new/changed records; when empty
#                       (per +Util.empty_s?+) the current time is used instead
# @param options        any extra keyword arguments, stored as-is in +@options+
def initialize(artist=nil,url=nil,album: nil,training: false,train_filepath: nil,updated_on: nil,
               **options)
  @album = album
  @artist = artist
  @options = options
  @url = url

  # Keep a caller-supplied timestamp. The previous modifier-if form only assigned
  # when the argument was empty, so an explicit value left @updated_on as nil.
  @updated_on = Util.empty_s?(updated_on) ? Util.format_datetime(DateTime.now) : updated_on

  @trainers = Trainers.new(train_filepath)
  @training = training

  # Keys look like shorthand for the tag names on the right (presumably what is
  # typed while training -- confirm against Trainer).
  @trainers['aum_year_album'] = Trainer.new({
    'alds' => 'album_dates',
    'altt' => 'album_title',
    'allo' => 'album_locations',
    'almi' => 'album_mini_desc',
    'alma' => 'album_main_desc',
    'aust' => 'aum_subtitle',
    'aulg' => 'aum_languages', # See 2018 "Montreal QB w/Robert Coxon (3)" aums' subtitles "FRENCH"
    'autt' => 'aum_title',
    'autm' => 'aum_timespan',
    'ausz' => 'aum_filesize',
    'aufn' => 'aum_filename',
    'audu' => 'dump',
    'i'    => 'ignore',
  })
  @trainers['aum_year_album_mini_desc'] = Trainer.new({
    'd' => 'date',
    'l' => 'location',
    's' => 'desc',
    'i' => 'ignore',
  })
end
Public Instance Methods
parse_aums(doc,album)
click to toggle source
# File lib/unclekryon/parsers/kryon_aum_year_album_parser.rb, line 140
#
# Creates an AumData for every usable ".mp3" anchor in +doc+ and stores it in
# +album.aums+ keyed by its URL. Subtitle, languages, title, timespan and
# (fallback) filesize come from the lists that #parse_dump collected in
# +@local_dump+, matched by position: the Nth accepted link gets the Nth entry.
#
# @param doc   parsed page; only +doc.css('a')+ is used
# @param album album record whose +aums+ (and possibly +updated_on+) are updated
def parse_aums(doc,album)
  anchors = doc.css('a')

  return if anchors.nil?

  mp3_regex = /\.mp3/i
  excluded_regex = %r{
    files\.kryonespanol\.com/audio/
  }ix

  # Counted by hand (not #each_with_index) because skipped anchors must not
  # advance the position used to index into @local_dump.
  aum_index = 0

  anchors.each do |anchor|
    next if anchor.nil?

    href = anchor['href']

    next if href.nil? || href.empty?
    next unless mp3_regex.match?(href)
    next if excluded_regex.match?(href)

    aum = AumData.new
    aum.url = Util.clean_data(href)
    aum.filename = Util.parse_url_filename(aum.url)
    aum.updated_on = @updated_on

    # Links starting with "./" or "../" are resolved against the album page URL
    aum.url = Util.clean_link(@url,aum.url) if %r{\A\.\.?/}.match?(aum.url)

    # Filesize from the response headers; slow, so skipped while testing
    unless DevOpts.instance.test?
      begin
        headers = Util.get_url_header_data(aum.url)
        aum.filesize = headers['content-length']
        aum.filesize = aum.filesize[0] if aum.filesize.is_a?(Array)
      rescue StandardError => e
        raise e.exception("#{e.message}; couldn't get header data for #{aum.url}")
      end
    end

    # Subtitle
    subtitles = @local_dump[:aum_subtitle]

    if aum_index >= subtitles.length
      log.warn("No subtitle for: #{aum.filename},#{aum.url}")
    else
      aum.subtitle = subtitles[aum_index]
    end

    # Languages
    languages = @local_dump[:aum_languages]
    aum.languages = languages[aum_index] if aum_index < languages.length

    # Title; without one, fall back to the filename (more descriptive), then the subtitle
    titles = @local_dump[:aum_title]

    if aum_index < titles.length
      aum.title = titles[aum_index]
    else
      basename = aum.filename

      if basename.nil? || basename.strip.empty?
        aum.title = aum.subtitle
        log.warn("Using subtitle as title: #{aum.title}")
      else
        aum.title = basename.gsub(mp3_regex,'').strip
        log.warn("Using filename as title: #{aum.title}")
      end
    end

    # Timespan; a missing one is only logged, never raised
    timespans = @local_dump[:aum_timespan]

    if aum_index >= timespans.length
      log.warn("No timespan for: #{aum.title},#{aum.subtitle},#{aum.filename},#{aum.url}")
    else
      aum.timespan = timespans[aum_index]
    end

    # Filesize from the page text, if the headers did not provide one
    filesizes = @local_dump[:aum_filesize]

    if (aum.filesize.nil? || aum.filesize.strip.empty?) && aum_index < filesizes.length
      aum.filesize = filesizes[aum_index]
      log.warn("Using local dump filesize: #{aum.filesize}")
    end

    aum_index += 1

    # An aum equal to the stored one keeps its old timestamp; anything else marks the album as updated
    unchanged = album.aums.key?(aum.url) && aum == album.aums[aum.url]

    if unchanged
      aum.updated_on = album.aums[aum.url].updated_on
    else
      album.updated_on = @updated_on
    end

    album.aums[aum.url] = aum
  end
end
parse_dump(doc,album)
click to toggle source
# File lib/unclekryon/parsers/kryon_aum_year_album_parser.rb, line 240
#
# Walks every +td+ cell of the album page and sorts its cleaned text into:
# - +@local_dump+ lists (timespans, filesizes, subtitles, languages, titles),
#   the mp3 counter and the album header flags;
# - +album.mini_desc+ / +album.main_desc+;
# - +album.dump+ for any cell in which no paragraph was consumed.
#
# In training mode (+@training+) paragraphs are fed to the trainers instead of
# being tagged, and the cell still ends up in +album.dump+.
#
# Must run before #parse_aums, which reads the lists collected here.
#
# @param doc   parsed page; only +doc.css('td')+ is used
# @param album album record; +dump+ is reset, +mini_desc+/+main_desc+ may be set
def parse_dump(doc,album)
  album.dump = []
  tds = doc.css('td')

  return if tds.nil?

  filename_regex = /\.mp3[[:space:]]*\z/i
  # 2017 "Petra, Jordan (5)" has a ":" in the megabytes cell
  size_regex = /\A[[:space:]]*[[:digit:]]+(\.|\:|[[:digit:]]|[[:space:]])*megabytes[[:space:]]*\z/i
  # 2017 "Monument Valley Tour (11)" has a "." in the minutes cell
  # 2017 "SUMMER LIGHT CONFERENCE PANEL (1)" is a special case ("One hour 6 minutes - (66 minutes)")
  time_regex = /
    \A[[:space:]]*[[:digit:]]+(\:|\.|[[:digit:]]|[[:space:]])*(minutes|Min)[[:space:]]*\z|
    \([[:space:]]*[[:digit:]]+[[:space:]]+minutes[[:space:]]*\)[[:space:]]*\z
  /ix
  # 2017 "KRYON INDIA-NEPAL TOUR PART 1 (10)" doesn't have the word "megabytes"
  time_or_size_regex = /\A[[:space:]]*[[:digit:]]+(\:|\.|[[:digit:]]|[[:space:]])*\z/i
  # 2015 ones have a lot of "13:12 Min - 15.9 megs"
  time_and_size_regex = /\A
    [[:space:]]*[[:digit:]]+[\:\.][[:digit:]]+
    [[:space:]]+Min[[:space:]]+\-
    [[:space:]]+[[:digit:]]+\.?[[:digit:]]*[[:space:]]*megs
  /xi

  size_count = 0
  time_count = 0

  tds.each do |td|
    next if td.nil?
    next if td.content.nil?

    orig_c = Util.clean_charset(td.content)
    c = Util.clean_data(orig_c)

    next if c.empty?

    add_to_dump = true

    # Order matters: the first matching pattern wins.
    case c
    when time_regex
      @local_dump[:aum_timespan].push(TimespanData.new(c).to_s)
      time_count += 1
      add_to_dump = false
    when size_regex
      @local_dump[:aum_filesize].push(c)
      size_count += 1
      add_to_dump = false
    when time_or_size_regex
      # A bare number: time is usually before size, so equal counts mean "time"
      if time_count == size_count
        @local_dump[:aum_timespan].push(TimespanData.new(c).to_s)
        time_count += 1
      else
        @local_dump[:aum_filesize].push(c)
        size_count += 1
      end

      add_to_dump = false
    when time_and_size_regex
      time_text,size_text = c.split(/[[:space:]]*\-[[:space:]]*/) # Split on '-'

      @local_dump[:aum_timespan].push(TimespanData.new(time_text).to_s)
      time_count += 1
      @local_dump[:aum_filesize].push(size_text)
      size_count += 1
      add_to_dump = false
    when filename_regex
      @local_dump[:aums] += 1
      add_to_dump = false
    else
      # Paragraphs: split the raw (not whitespace-collapsed) text on blank lines
      pars = orig_c.gsub(/\A[[:space:]]+/,'').gsub(/[[:space:]]+\z/,'')
      pars = pars.split(/[\r\n\p{Zl}\p{Zp}]{2,}/)

      pars.each do |par|
        par = par.gsub(/[[:blank:]]+/,' ').strip
        par = Util.fix_shortwith_text(par)

        next if par.empty?

        if @training
          if @trainers['aum_year_album'].train(par) == 'album_mini_desc'
            par.split(/\n+/).each do |line|
              @trainers['aum_year_album_mini_desc'].train(line)
            end
          end
        elsif store_dump_paragraph(par,album)
          # One consumed paragraph keeps the whole cell out of the dump
          add_to_dump = false
        end
      end
    end

    # Only the cleaned text is kept, not the cell's HTML; if the font size is
    # big, it's bad for mobile anyway
    album.dump.push(c) if add_to_dump
  end
end

# Tags one paragraph with the 'aum_year_album' trainer and stores it where the
# tag says; returns true if the paragraph was consumed (must not go to the dump).
#
# NOTE: the old always-true "has_header" gate (and its unreachable "No header
# yet" branch) was removed; every tag is handled by this single +case+.
private def store_dump_paragraph(par,album)
  tag = @trainers['aum_year_album'].tag(par)

  # For 2017 "RETURN TO LEMURIA (7)"
  if par =~ /\A[[:space:]]*MEDITATION[[:space:]]+-
             [[:space:]]+Kalei[[:space:]]+-
             [[:space:]]+John[[:space:]]+-
             [[:space:]]+Amber[[:space:]]*\z/xi
    tag = 'aum_title'
    log.warn("Changing tag to aum_title: #{Util.clean_data(par)}")
  end

  case tag
  when 'album_title'
    # Header tags only raise a flag; the text itself still goes to the dump
    @local_dump[:album_title] ||= true
    false
  when 'album_dates'
    @local_dump[:album_dates] ||= true
    false
  when 'album_locations'
    @local_dump[:album_locations] ||= true
    false
  when 'album_mini_desc'
    par.split(/\n+/).each do |line|
      line = Util.clean_data(line)

      next if line.empty?

      # Only 'desc' lines are kept; other tags (besides the logged 'ignore') are dropped silently
      case @trainers['aum_year_album_mini_desc'].tag(line)
      when 'desc'
        if !@local_dump[:album_mini_desc]
          @local_dump[:album_mini_desc] = true
          album.mini_desc = line
        else
          album.mini_desc << ' | ' if !album.mini_desc.strip.empty?
          album.mini_desc << line
        end
      when 'ignore'
        log.warn("Excluding mini desc content: #{line}")
      end
    end

    true
  when 'album_main_desc'
    if !@local_dump[:album_main_desc]
      @local_dump[:album_main_desc] = true
      album.main_desc = ''.dup
    else
      album.main_desc << "\n\n" if !album.main_desc.strip.empty?
    end

    par.split(/\n+/).each do |line|
      album.main_desc << Util.clean_data(line) << "\n"
    end

    album.main_desc = album.main_desc.strip # Remove last newline
    true
  when 'ignore'
    log.warn("Excluding content: #{Util.clean_data(par)}")
    true
  when 'aum_subtitle'
    @local_dump[:aum_subtitle].push(Util.clean_data(par))
    true
  when 'aum_languages'
    # A languages paragraph also counts as that aum's subtitle
    text = Util.clean_data(par)

    @local_dump[:aum_languages].push(Iso.languages.find_by_kryon(text))
    @local_dump[:aum_subtitle].push(text)
    true
  when 'aum_title'
    @local_dump[:aum_title].push(Util.clean_data(par))

    # Special case for 2017 "LISBON, PORTUGAL (Fatima Tour) (3)"
    if par =~ /\A[[:space:]]*Lisbon[[:space:]]+Channeling[[:space:]]+1[[:space:]]*\z/i
      @local_dump[:aum_title].push('Lisbon Channeling 2')
      @local_dump[:aum_title].push('Lisbon Channeling 3')
      log.warn("Adding aum_titles for: #{Util.clean_data(par)}")
    end

    # For 2017 "KRYON INDIA-NEPAL TOUR PART 1 (10)" & "KRYON INDIA-NEPAL TOUR PART 2 (8)"
    if par =~ /\A[[:space:]]*PAGE[[:space:]]*(ONE|TWO)[[:space:]]*\z/i
      ignored_title = @local_dump[:aum_title].pop
      log.warn("Ignoring aum title: #{ignored_title}")
    end

    true
  when 'aum_filename'
    true
  else
    # Unknown tags leave the cell in the dump
    false
  end
end
parse_pics(doc,album)
click to toggle source
# File lib/unclekryon/parsers/kryon_aum_year_album_parser.rb, line 444
#
# Creates a PicData for every +img+ in +doc+ whose +src+ is not on the
# exclusion list (site buttons, nav menus, etc.) and stores it in
# +album.pics+ keyed by its cleaned URL.
#
# @param doc   parsed page; only +doc.css('img')+ is used
# @param album album record whose +pics+ (and possibly +updated_on+) are updated
def parse_pics(doc,album)
  images = doc.css('img')

  return if images.nil?

  excluded_regex = /
    buttonMP3\.png|
    freedownloadtype\.gif|
    handani\.gif|
    Kryonglobe\.jpg|
    MP3\-download\.jpg|
    MP3\-graphic\(SM\)\.jpg|
    NavMenu\_AUDIOmaster\.png|
    NavMenu\_master\.png|
    testimonials\.png
  /ix

  images.each do |image|
    next if image.nil?

    src = image['src']

    next if src.nil? || src.empty?

    if excluded_regex.match?(src)
      log.warn("Excluding image: #{src}")
      next
    end

    pic = PicData.new
    pic.url = Util.clean_link(url,src)
    pic.filename = Util.parse_url_filename(pic.url)

    pic.alt = image['alt']
    pic.alt = '' if Util.empty_s?(pic.alt)
    pic.caption = ''

    # Name the picture after its alt text, or the extension-less filename without one
    if Util.empty_s?(pic.alt)
      pic.name = File.basename(pic.filename,File.extname(pic.filename))
    else
      pic.name = pic.alt
    end

    pic.updated_on = @updated_on

    # A pic equal to the stored one keeps its old timestamp; anything else marks the album as updated
    unchanged = album.pics.key?(pic.url) && pic == album.pics[pic.url]

    if unchanged
      pic.updated_on = album.pics[pic.url].updated_on
    else
      album.updated_on = @updated_on
    end

    album.pics[pic.url] = pic
  end
end
parse_site(artist=nil,url=nil)
click to toggle source
# File lib/unclekryon/parsers/kryon_aum_year_album_parser.rb, line 74
#
# Downloads the album page and parses it into a clone of the artist's existing
# album for that URL, then stores the clone back in +artist.albums+.
#
# In training mode only #parse_dump runs and the clone is returned without
# being stored.
#
# @param artist replaces +@artist+ when not nil; must respond to +albums+
# @param url    replaces +@url+ when not nil
# @return the parsed album, or nil when the URL is on the exclusion list
# @raise [ArgumentError] if the artist is nil, the URL is empty, or the artist
#                        has no album for the URL
def parse_site(artist=nil,url=nil)
  @artist = artist unless artist.nil?
  @url = url unless url.nil?

  # URLs that return 404 or are empty; fix by hand
  exclude_urls = %r{
    awakeningzone\.com/Episode\.aspx\?EpisodeID\=|
    www\.talkshoe\.com/talkshoe/web/audioPop\.jsp\?episodeId\=
  }ix

  if @url =~ exclude_urls
    log.warn("Excluding Album URL #{@url}")
    return
  end

  @trainers.load_file

  raise ArgumentError,'Artist cannot be nil' if @artist.nil?
  raise ArgumentError,'URL cannot be empty' if @url.nil? || (@url = @url.strip).empty?

  old_album = @artist.albums[@url]

  # A missing album used to surface as a NoMethodError on nil (nil.clone is nil),
  # and only after the page had been downloaded; the old "if old_album.nil?"
  # registration branch could never run. Fail early with a clear message instead.
  # NOTE(review): if albums are meant to be created here, build one in place of
  # raising -- the album class is not visible from this method.
  raise ArgumentError,"No album for URL: #{@url}" if old_album.nil?

  # Album data (flags are okay) should never go in this, only for aums, pics, etc.
  @local_dump = {
    album_dates: false,
    album_title: false,
    album_locations: false,
    album_mini_desc: false,
    album_main_desc: false,
    aums: 0,
    aum_subtitle: [],
    aum_languages: [],
    aum_title: [],
    aum_timespan: [],
    aum_filesize: [],
    aum_filename: [],
  }

  # Force 'utf-8'
  # - See charset "X-MAC-ROMAN" in 2017 "The Discovery Series", 2016 "Kryon in Budapest (5)"
  # Block form so that open-uri closes the stream/tempfile once it has been parsed.
  doc = URI(@url).open { |io| Nokogiri::HTML(io,nil,'utf-8') }

  @album = old_album.clone
  @album.updated_on = @updated_on
  @album.url = @url

  parse_dump(doc,@album) # Must be first because other methods rely on @local_dump

  return @album if @training # Currently, no other training occurs

  parse_pics(doc,@album)
  parse_aums(doc,@album)

  # Nothing changed, so keep the old timestamp
  @album.updated_on = old_album.updated_on if @album == old_album

  @artist.albums[@url] = @album

  return @album
end