class Alexandria::BookProviders::WorldCatProvider

Constants

BASE_SEARCH_URL
SITE

Public Class Methods

new()
# File lib/alexandria/book_providers/worldcat.rb, line 30
# Sets up the WorldCat provider.
#
# Passes "WorldCat" twice to the superclass initializer (presumably the
# provider's internal name and its display name -- confirm against the
# superclass), then calls prefs.read.
def initialize
  super("WorldCat", "WorldCat")
  prefs.read
end

Public Instance Methods

url(book)
# File lib/alexandria/book_providers/worldcat.rb, line 49
# Returns the WorldCat search URL that looks the given book up by its ISBN.
#
# book - object responding to #isbn
def url(book)
  isbn = book.isbn
  create_search_uri(SEARCH_BY_ISBN, isbn)
end

Private Instance Methods

create_search_uri(search_type, search_term)
# File lib/alexandria/book_providers/worldcat.rb, line 55
# Builds the WorldCat search URL for the given search type and term.
#
# search_type - one of SEARCH_BY_ISBN, SEARCH_BY_AUTHORS, SEARCH_BY_TITLE or
#               SEARCH_BY_KEYWORD; any other value is treated like a keyword
#               search (no index prefix)
# search_term - the text to search for; for ISBN searches it is passed
#               through Library.canonicalise_ean, otherwise it is URL-escaped
#
# Returns BASE_SEARCH_URL formatted with the escaped index prefix and the
# encoded search term.
def create_search_uri(search_type, search_term)
  # fetch with a default so an unmapped search type yields an empty prefix
  # instead of nil (CGI.escape raises on nil).
  search_type_code = { SEARCH_BY_ISBN    => "isbn:",
                       SEARCH_BY_AUTHORS => "au:",
                       SEARCH_BY_TITLE   => "ti:",
                       SEARCH_BY_KEYWORD => "" }.fetch(search_type, "")
  search_type_code = CGI.escape(search_type_code)
  search_term_encoded = if search_type == SEARCH_BY_ISBN
                          Library.canonicalise_ean(search_term) # isbn-13
                        else
                          CGI.escape(search_term)
                        end
  format(BASE_SEARCH_URL, search_type_code, search_term_encoded)
end
get_book_from_search_result(result)
# File lib/alexandria/book_providers/worldcat.rb, line 69
# Downloads the page a search result points at and parses it into a book.
#
# result - hash with a :url entry, as built by parse_search_result_data
#
# Returns whatever parse_result_data returns for the downloaded page body.
def get_book_from_search_result(result)
  page_url = result[:url]
  log.debug { "Fetching book from #{page_url}" }
  response = transport.get_response(URI.parse(page_url))
  parse_result_data(response.body)
end
parse_result_data(html, search_isbn = nil, recursing = false)
# File lib/alexandria/book_providers/worldcat.rb, line 106
# Parses a WorldCat HTML page and returns the book it describes.
#
# The page is handled according to what it contains:
# * a "no results" marker (div#div-results-none) raises NoResultsError;
# * a result list (table.table-results) causes each listed page to be
#   fetched and parsed in turn, one level deep only;
# * anything else is parsed as a single-book details page.
#
# html        - HTML source of the page
# search_isbn - the ISBN that was searched for, or nil; used to pick the
#               matching entry from a result list and as the book's ISBN
#               when the details page does not list one
# recursing   - true when the page was reached from a result list; a result
#               list found while recursing raises NoResultsError instead of
#               being followed again
#
# Returns a two-element array [book, cover_url]; cover_url is always nil
# because the cover image is not extracted from the page.
# Raises NoResultsError when no book can be obtained, including when any
# other StandardError occurs while parsing (that error is logged).
def parse_result_data(html, search_isbn = nil, recursing = false)
  doc = html_to_doc(html, "UTF-8")

  begin
    if doc % "div#div-results-none"
      log.debug { "WorldCat reports no results" }
      raise NoResultsError
    end

    if doc % "table.table-results"
      if recursing
        log.warn { "Infinite loop prevented redirecting through WorldCat" }
        raise NoResultsError
      end
      log.info { "Found multiple results for lookup: checking each" }
      return best_search_result(parse_search_result_data(html), search_isbn)
    end

    parse_book_page(doc, search_isbn)
  rescue StandardError => ex
    raise ex if ex.instance_of? NoResultsError

    trace = ex.backtrace.join("\n> ")
    log.warn do
      "Failed parsing search results for WorldCat " \
        "#{ex.message} #{trace}"
    end
    raise NoResultsError
  end
end

# Fetches each search result page and returns the [book, cover_url] pair
# whose ISBN matches search_isbn, falling back to the first parsable result.
# Without a search_isbn the first parsable result is returned immediately.
# Raises NoResultsError when no result page yields a book.
def best_search_result(search_results, search_isbn)
  # Loop-invariant, so canonicalise the searched ISBN only once.
  search_isbn_canon = Library.canonicalise_ean(search_isbn) if search_isbn
  first_result = nil

  search_results.each do |rslt|
    log.debug { "checking #{rslt[:url]}" }
    begin
      response = transport.get_response(URI.parse(rslt[:url]))
      book, cover_url = parse_result_data(response.body, search_isbn, true)
    rescue NoResultsError
      # One unusable candidate must not abort the scan of the others.
      log.debug { "no book at #{rslt[:url]}, checking next" }
      next
    end

    log.debug { "got book #{book}" }

    return [book, cover_url] unless search_isbn

    first_result ||= [book, cover_url]

    if search_isbn_canon == Library.canonicalise_ean(book.isbn)
      log.info { "book #{book} is a match" }
      return [book, cover_url]
    end
    log.debug { "not a match, checking next" }
  end

  unless first_result
    log.warn { "None of the WorldCat search results could be parsed" }
    raise NoResultsError
  end

  # gone through all and no ISBN match, so just return first result
  log.info do
    "no more results to check. Returning first result, just an approximation"
  end
  first_result
end

# Builds [book, cover_url] from a single-book details page.
# Raises NoResultsError when the page has no title.
def parse_book_page(doc, search_isbn)
  title_header = doc % "h1.title"
  title = title_header.inner_text if title_header
  unless title
    log.warn { "Unexpected lack of title from WorldCat lookup" }
    raise NoResultsError
  end
  log.info { "Found book #{title} at WorldCat" }

  authors_tr = doc % "tr#details-allauthors"
  authors = authors_tr ? (authors_tr / :a).map(&:inner_text) : []

  publisher, year = publication_details(doc)

  isbn_row = doc % "tr#details-standardno"
  isbn_cell = isbn_row && (isbn_row / "td").last
  first_isbn = isbn_cell && isbn_cell.inner_text.split.first
  if first_isbn
    isbn = Library.canonicalise_isbn(first_isbn)
  else
    log.warn { "No ISBN found on page" }
    isbn = search_isbn
  end

  book_binding = "" # not given on WorldCat website (as far as I can tell)
  book = Book.new(title, authors, isbn, publisher, year, book_binding)

  image_url = nil # hm, it's on the website, but uses JavaScript...

  [book, image_url]
end

# Extracts [publisher, year] from the "Publisher" row of the bibliographic
# data table. Either element is nil when it cannot be found.
def publication_details(doc)
  # can we do better? get the City name?? or multiple publishers?
  bibdata = doc % "div#bibdata"
  bibdata_table = bibdata && (bibdata % :table)
  publisher_header = bibdata_table && (bibdata_table % "th[text()*=Publisher]")
  return [nil, nil] unless publisher_header

  info_cell = (publisher_header.parent / "td").last
  return [nil, nil] unless info_cell

  publication_info = info_cell.inner_text
  publisher_pattern = if publication_info.include?(";")
                        /;\s*([^\d]+)\s*\d*/
                      elsif publication_info.include?(":")
                        /:\s*([^;:,]+)/
                      else
                        /([^;,]+)/
                      end

  # String#[] with a capture index returns nil on no match, so a missing
  # publisher or year no longer raises.
  publisher = publication_info[publisher_pattern, 1]
  year_text = publication_info[/([12][0-9]{3})/, 1]
  [publisher, year_text && year_text.to_i]
end
parse_search_result_data(html)
# File lib/alexandria/book_providers/worldcat.rb, line 75
# Extracts the book entries from a WorldCat search-results page.
#
# Only rows whose type icon "src" contains "icon-bks" are kept. Rows without
# a usable icon or without a link are skipped individually, so one malformed
# row does not discard the rows that follow it.
#
# html - HTML source of the search-results page
#
# Returns an array of hashes with :title (text of the row's name div) and
# :url (SITE followed by the link's href). Any StandardError raised while
# walking the page is logged and the results gathered so far are returned.
def parse_search_result_data(html)
  doc = html_to_doc(html, "UTF-8")
  book_search_results = []
  begin
    (doc / "td.result/div.name").each do |div|
      td = div.parent
      type_icon = td % "div.type/img.icn"
      icon_src = type_icon && type_icon["src"]
      next unless icon_src && icon_src.include?("icon-bks")

      name_div = td % "div.name"
      anchor = name_div % :a
      href = anchor && anchor["href"]
      # Without a link there is nothing to look up; SITE alone is not a
      # book page.
      next if href.nil? || href.empty?

      book_search_results << { title: name_div.inner_text,
                               url: "#{SITE}#{href}" }
    end
  rescue StandardError => ex
    trace = ex.backtrace.join("\n> ")
    log.warn do
      "Failed parsing search results for WorldCat " \
        "#{ex.message} #{trace}"
    end
  end
  book_search_results
end