class WebPageParser::HTTP::Session

Public Instance Methods

curl() click to toggle source
# File lib/web-page-parser/http.rb, line 19
# Returns this session's Curl::Easy handle, building and configuring it on
# first use and reusing the same instance on later calls.
def curl
  @curl ||= Curl::Easy.new do |easy|
    # Timeouts and DNS caching.
    easy.connect_timeout = 8
    easy.timeout = 8
    easy.dns_cache_timeout = 600

    # Redirect and cookie handling.
    easy.follow_location = true
    easy.max_redirects = 6
    easy.autoreferer = true
    easy.enable_cookies = true

    # Request headers: a browser User-Agent and a request for compressed
    # bodies.
    easy.headers["User-Agent"] = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.0.4) Gecko/20060508 Firefox/1.5.0.4'
    easy.headers["Accept-encoding"] = 'gzip, deflate'
  end
end
get(url) click to toggle source
# File lib/web-page-parser/http.rb, line 33
# Fetches +url+ with the session's curl handle and returns a Response built
# from the body and the handle.
#
# The body is passed to gunzip first and to inflate second; when neither
# produces a result the body is used exactly as received.
#
# Raises CurlError when curl.perform returns false.
def get(url)
  handle = curl
  handle.url = url
  raise CurlError, "curl.perform returned false" if handle.perform == false

  raw = handle.body_str
  decoded = gunzip(raw)
  decoded = inflate(raw) if decoded.nil?
  page = decoded || raw

  # Tagging the body as UTF-8 is a guess rather than a verified choice; it
  # works for BBC/Guardian/New York Times pages. Skipped when the string
  # has no force_encoding method.
  page.force_encoding("utf-8") if page.respond_to?(:force_encoding)

  Response.new(page, handle)
end
gunzip(s) click to toggle source
# File lib/web-page-parser/http.rb, line 54
# Decompresses the gzip-format string +s+.
#
# Returns the decompressed data, or nil when reading fails with
# Zlib::DataError or Zlib::GzipFile::Error (for example when +s+ is not in
# gzip format).
def gunzip(s)
  reader = Zlib::GzipReader.new(StringIO.new(s))
  reader.read
rescue Zlib::DataError, Zlib::GzipFile::Error
  nil
end
inflate(s) click to toggle source
# File lib/web-page-parser/http.rb, line 48
# Inflates the zlib-format (deflate) string +s+.
#
# Returns the decompressed data, or nil when +s+ cannot be inflated, so the
# caller can fall back to the unmodified input.
def inflate(s)
  Zlib::Inflate.inflate(s)
rescue Zlib::DataError, Zlib::BufError, Zlib::NeedDict
  # DataError: the input is not deflate data.
  # BufError:  the input is empty or ends before the stream is complete.
  # NeedDict:  the header asks for a preset dictionary that is not available.
  nil
end