class Newrank

Public Instance Methods

_md5(str) click to toggle source

Uses the JavaScript MD5 algorithm written by Newrank; the script is in assets/newrank_md5.js.

# File lib/newrank.rb, line 140
# Hashes +str+ by calling the +newrank_md5+ function inside the JavaScript
# context returned by +js_context+.
#
# str:: the string to hash.
#
# Returns whatever the JavaScript function returns for +str+.
def _md5(str)
  context = js_context
  context.call('newrank_md5', str, bare: true)
end
crawl(newrank_id) click to toggle source

crawl newrank info

# File lib/newrank.rb, line 10
# Collects the Newrank data for a single account.
#
# Non-breaking spaces (U+00A0) are stripped from +newrank_id+ before the
# detail page is requested through +document+.
#
# Returns a Hash with the keys +:active_users_count+, +:score+,
# +:introduce+, +:week_data+ and +:posts_data+. When no document is
# available every value is an empty default (0, "", [] or {}).
def crawl(newrank_id)
  page = document(newrank_id.gsub("\u{a0}", ""))

  if page.nil?
    {
      active_users_count: 0,
      score: 0,
      introduce: "",
      week_data: [],
      posts_data: {}
    }
  else
    score, uuid = score_and_uuid(page)

    # Fan count is rendered with thousands separators, e.g. "1,234".
    fans_node = page.css(".detail-fans-counts")[0]
    active_users_count = fans_node ? fans_node.text.gsub(",", "").to_i : 0

    intro_node = page.css(".info-detail-head-weixin-fun-introduce")[0]
    introduce = intro_node ? intro_node.text : ""

    weekly = week_data(page)

    # Posts are only requested when a uuid was extracted from the page.
    posts_data = uuid.nil? ? nil : fetch_post(uuid)

    {
      active_users_count: active_users_count,
      score: score || 0,
      introduce: introduce,
      week_data: weekly,
      posts_data: posts_data || {}
    }
  end
end
document(newrank_account) click to toggle source

Get the Nokogiri document

# File lib/newrank.rb, line 70
# Downloads the Newrank detail page for an account and parses it with
# Nokogiri.
#
# +wait_for_seconds+ is called first, so every request is preceded by a
# pause.
#
# newrank_account:: account id placed in the +account+ query parameter.
#                   It is form-encoded so ids containing spaces, "&" or
#                   non-ASCII characters still yield a valid URL.
#
# Returns the document produced by Nokogiri::HTML, parsed as UTF-8.
# Errors raised by open-uri (HTTP errors, timeouts) are not rescued here.
def document(newrank_account)
  wait_for_seconds

  account = URI.encode_www_form_component(newrank_account)
  uri = URI.parse("http://www.newrank.cn/public/info/detail.html?account=#{account}")
  options = {
    "User-Agent" => "Mozilla/5.0 (Windows NT 6.2; rv:10.0.1) Gecko/20100101 Firefox/10.0.1",
    :read_timeout => 10
  }

  # URI#open (from open-uri) is used instead of Kernel#open: Kernel#open no
  # longer fetches URLs on Ruby 3.0+. The block form closes the underlying
  # IO/Tempfile once Nokogiri has read it.
  uri.open(options) { |io| Nokogiri::HTML(io, nil, 'utf-8') }
end
fetch_post(uuid) click to toggle source

crawl posts

# File lib/newrank.rb, line 44
# Requests the article list for the account identified by +uuid+.
#
# A fresh nonce and its matching +xyz+ signature are generated, then
# +wait_for_seconds+ pauses before the POST is sent.
#
# Returns the parsed JSON response body.
def fetch_post(uuid)
  nonce = gen_nonce
  signature = gen_xyz(nonce, uuid)

  wait_for_seconds

  endpoint = "http://www.newrank.cn/xdnphb/detail/getAccountArticle"
  payload = {uuid: uuid, nonce: nonce, xyz: signature, flag: true}
  headers = {"User-Agent" => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36"}

  JSON.parse(RestClient.post(endpoint, payload, headers))
end
gen_nonce() click to toggle source

generate parameter nonce

# File lib/newrank.rb, line 117
    # Generates the +nonce+ request parameter.
    #
    # Returns a String of nine random lowercase hexadecimal digits
    # (characters 0-9 and a-f), drawn with Kernel#rand. Exactly one string
    # is generated per call.
    def gen_nonce
      Array.new(9) { rand(16).to_s(16) }.join
    end
gen_xyz(nonce, uuid) click to toggle source

generate parameter xyz

# File lib/newrank.rb, line 134
    # Generates the +xyz+ request parameter.
    #
    # The article endpoint path and its query string (AppKey, flag, uuid and
    # nonce, in that order) are joined into one string and hashed with +_md5+.
    #
    # Returns the value produced by +_md5+.
    def gen_xyz(nonce, uuid)
      signing_input = "/xdnphb/detail/getAccountArticle?AppKey=joker&flag=true&uuid=#{uuid}&nonce=#{nonce}"
      _md5(signing_input)
    end
js_context() click to toggle source

js context

# File lib/newrank.rb, line 145
# Returns the ExecJS context compiled from assets/newrank_md5.js, located
# relative to this file's directory.
#
# The context is compiled on first use and memoized in +@context+.
def js_context
  @context ||= begin
    script_path = File.join(File.dirname(__FILE__), 'assets/newrank_md5.js')
    ExecJS.compile(File.read(script_path))
  end
end
score_and_uuid(doc) click to toggle source

find score and uuid

# File lib/newrank.rb, line 77
# Extracts the account score and uuid from the first
# <tt>script[type='text/javascript']</tt> element of +doc+.
#
# The script is parsed with RKelly. The score is read from the
# +new_rank_index_mark+ key of the first element of the first array
# literal. The uuid is read from the +"uuid"+ property of the variable
# declaration named +fgkcdg+, with its surrounding quotes removed.
#
# Returns a two-element Array <tt>[score, uuid]</tt>:
# * <tt>[nil, nil]</tt> when the page has no matching script element;
# * score is +0.0+ when no score can be found;
# * uuid is the String <tt>"uuid nil"</tt> when no uuid can be found.
def score_and_uuid(doc)
  script = doc.css("script[type='text/javascript']")[0]
  return [nil, nil] if script.nil?

  ast = RKelly::Parser.new.parse(script.text.strip)
  # NOTE(review): guards against the parser handing back no AST for
  # unparsable script text; confirm against the RKelly version in use.
  return [0.0, "uuid nil"] if ast.nil?

  # Find the first array node.
  array_node = ast.pointcut(RKelly::Nodes::ArrayNode).matches.first

  # Find the first element node inside the array node and look up the score.
  # The array itself may be missing, in which case there is no score.
  element_node = array_node && array_node.pointcut(RKelly::Nodes::ElementNode).matches.first
  json_data = element_node ? JSON.parse(element_node.to_ecma) : {}
  raw_score = json_data["new_rank_index_mark"]
  score = raw_score ? raw_score.to_f : 0.0

  # Find the node that carries the uuid.
  declaration = ast.pointcut(RKelly::Nodes::VarDeclNode).matches.find { |node| node.name == "fgkcdg" }
  property = declaration && declaration.pointcut(RKelly::Nodes::PropertyNode).matches.find { |n| n.name == '"uuid"' }

  # The property's value node holds the quoted literal; strip the quotes.
  uuid = property ? property.value.value[1..-2] : "uuid nil"

  [score, uuid]
end
wait_for_seconds() click to toggle source

Wait one to two seconds before a request, to avoid sending requests too frequently.

# File lib/newrank.rb, line 112
# Pauses for a random duration of at least one and less than two seconds.
#
# Returns the value returned by +sleep+.
def wait_for_seconds
  delay = 1 + rand
  sleep(delay)
end
week_data(doc) click to toggle source

crawl week data

# File lib/newrank.rb, line 54
# Extracts the weekly data embedded in the first <tt>script</tt> element
# of +doc+.
#
# The script is parsed with RKelly, and every element of the first array
# literal is converted back to source text and decoded as JSON.
#
# Returns an Array of the decoded elements. The result is empty when the
# page has no script element or the script contains no array literal.
def week_data(doc)
  script = doc.css("script")[0]
  return [] if script.nil?

  ast = RKelly::Parser.new.parse(script.text.strip)
  # NOTE(review): the nil check on +ast+ covers a parser that yields no AST
  # for unparsable script text; confirm against the RKelly version in use.
  array_node = ast && ast.pointcut(RKelly::Nodes::ArrayNode).matches.first
  return [] if array_node.nil?

  array_node.pointcut(RKelly::Nodes::ElementNode).matches.map do |element_node|
    JSON.parse(element_node.to_ecma)
  end
end