module SpiderHelper
Constants
- BomHeaderMap
Public Class Methods
direct_http_get(href, local_path, params: nil, header: nil, convert_to_utf8: false)
click to toggle source
# File lib/spider_helper.rb, line 6
# Fetches +href+ with an HTTP(S) GET request and writes the response body
# to +local_path+.
#
# href            - target URL; an Addressable::URI is used as-is, anything
#                   else is converted with to_s and passed to string_to_uri.
# local_path      - destination file; missing parent directories are created.
# params          - optional form parameters; when given they are encoded
#                   with URI.encode_www_form and replace href's query string.
# header          - optional Hash of request header names to values.
# convert_to_utf8 - when true, the body is passed through to_utf8 before
#                   being written.
#
# Returns true when the response is a Net::HTTPSuccess and the body was
# written; false otherwise. A StandardError raised during the request or the
# write is printed (backtrace, then message) and reported as false.
def direct_http_get(href, local_path, params: nil, header: nil, convert_to_utf8: false)
  href = string_to_uri(href.to_s) unless href.is_a?(Addressable::URI)

  begin
    href.query = URI.encode_www_form(params) if params
    req = Net::HTTP::Get.new(href)
    header.each { |k, v| req[k] = v } if header

    # Without use_ssl an https URL would be contacted in plaintext. When the
    # URL has no explicit port, Net::HTTP picks 443 or 80 from this flag.
    use_ssl = href.scheme.to_s.downcase == 'https'
    res = Net::HTTP.start(href.hostname, href.port, use_ssl: use_ssl) do |http|
      http.request(req)
    end

    unless res.is_a?(Net::HTTPSuccess)
      puts res
      return false
    end

    # mkdir_p is a no-op when the directory already exists.
    FileUtils.mkdir_p(File.dirname(local_path))
    content = res.body
    content = to_utf8(content) if convert_to_utf8
    File.write(local_path, content)
    puts 'succeed'
    true
  rescue StandardError => e
    puts e.backtrace
    puts e
    false
  end
end
direct_http_post(href, local_path, params, header: nil, convert_to_utf8: false)
click to toggle source
# File lib/spider_helper.rb, line 39
# Sends +params+ to +href+ as a form-encoded HTTP(S) POST request and writes
# the response body to +local_path+.
#
# href            - target URL; an Addressable::URI is used as-is, anything
#                   else is converted with to_s and passed to string_to_uri.
# local_path      - destination file; missing parent directories are created.
# params          - form fields handed to Net::HTTP::Post#set_form_data.
# header          - optional Hash of request header names to values.
# convert_to_utf8 - when true, the body is passed through to_utf8 before
#                   being written.
#
# Returns true when the response is a Net::HTTPSuccess and the body was
# written; false otherwise. A StandardError raised during the request or the
# write is printed and reported as false.
def direct_http_post(href, local_path, params, header: nil, convert_to_utf8: false)
  href = string_to_uri(href.to_s) unless href.is_a?(Addressable::URI)

  begin
    req = Net::HTTP::Post.new(href)
    req.set_form_data(params)
    header.each { |k, v| req[k] = v } if header

    # Without use_ssl an https URL would be contacted in plaintext. When the
    # URL has no explicit port, Net::HTTP picks 443 or 80 from this flag.
    use_ssl = href.scheme.to_s.downcase == 'https'
    res = Net::HTTP.start(href.hostname, href.port, use_ssl: use_ssl) do |http|
      http.request(req)
    end

    unless res.is_a?(Net::HTTPSuccess)
      puts res
      return false
    end

    # mkdir_p is a no-op when the directory already exists.
    FileUtils.mkdir_p(File.dirname(local_path))
    content = res.body
    content = to_utf8(content) if convert_to_utf8
    File.write(local_path, content)
    puts 'succeed'
    true
  rescue StandardError => e
    puts e
    false
  end
end
extract_href_last(origin_href)
click to toggle source
# File lib/spider_helper.rb, line 71
# Returns the last '/'-separated segment of +origin_href+.
#
# origin_href - a String, or any object whose to_s yields the URL text
#               (for example a URI object).
#
# Trailing slashes are ignored because String#split drops trailing empty
# fields. Returns nil when there is no segment at all (empty input or only
# slashes).
def extract_href_last(origin_href)
  origin_href.to_s.split('/').last
end
smart_to_utf8(str)
click to toggle source
此函数的编码判断有时有误;如需可靠转换,请直接使用 to_utf8 函数。
# File lib/spider_helper.rb, line 89
# Returns +str+ untouched when it is already valid UTF-8; otherwise hands it
# to to_utf8 for conversion.
#
# The encoding tag alone is not trusted: a string can be tagged UTF-8 while
# holding bytes that are not valid UTF-8, so the tag is only accepted when
# valid_encoding? also holds.
def smart_to_utf8(str)
  return str if str.encoding == Encoding::UTF_8 && str.valid_encoding?

  to_utf8(str)
end
string_to_uri(href)
click to toggle source
# File lib/spider_helper.rb, line 75
# Parses +href+ into a normalized Addressable::URI.
#
# The first occurrence of 'http:///' (three slashes) is rewritten to
# 'http://' before parsing. The replacement works on a copy, so the caller's
# string is left unmodified and frozen strings are accepted.
#
# Returns the value of Addressable::URI#normalize! on the parsed URI.
def string_to_uri(href)
  cleaned = href.sub('http:///', 'http://')
  Addressable::URI.parse(cleaned).normalize!
end
to_utf8(str)
click to toggle source
# File lib/spider_helper.rb, line 94
# Converts +str+ to UTF-8 in place and returns it.
#
# The encoding is guessed with CharDet.detect. When the reported confidence
# is above 0.6, the detected encoding name is printed, the string is
# re-tagged with it, and a leading BOM registered in BomHeaderMap for that
# encoding is removed. The string is then transcoded to UTF-8, with invalid
# or undefined characters replaced by '?'.
def to_utf8(str)
  # Works around the CharDet library being built for ASCII-8BIT on Windows,
  # which is incompatible with UTF-8 input.
  str.force_encoding(Encoding::ASCII_8BIT)
  detection = CharDet.detect(str)

  if detection['confidence'] > 0.6
    encoding_name = detection['encoding']
    puts encoding_name
    str.force_encoding(encoding_name)

    # Strip the BOM header, if one is registered for this encoding.
    bom = BomHeaderMap[encoding_name]
    str.sub!(bom, '') if bom
  end

  # encode! transcodes in place and returns the receiver.
  str.encode!(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: '?')
end