class DomainsScanner::Crawlers::Baidu
Public Instance Methods
host()
click to toggle source
# File lib/domains_scanner/crawlers/baidu.rb, line 4
# Base URL (scheme + host) of the Baidu search site.
# NOTE(review): presumably consumed by the shared crawler logic to open the
# search page -- confirm against the caller.
#
# @return [String] always "https://www.baidu.com"
def host
  "https://www.baidu.com"
end
keyword_field_name()
click to toggle source
# File lib/domains_scanner/crawlers/baidu.rb, line 8
# Name of the field that carries the search keyword.
# NOTE(review): presumably the name of the query input on Baidu's search
# form ("wd") -- confirm against the code that fills in the form.
#
# @return [String] always "wd"
def keyword_field_name
  "wd"
end
next_page_link_selector()
click to toggle source
# File lib/domains_scanner/crawlers/baidu.rb, line 32
# CSS selector for the link leading to the next page of results: the <a>
# that immediately follows a <strong> inside the element with id "page".
# NOTE(review): presumably the <strong> marks the current page number in
# Baidu's pager -- confirm against the live markup.
#
# @return [String] always "#page strong+a"
def next_page_link_selector
  "#page strong+a"
end
parse_results(doc)
click to toggle source
Returns an array of hashes: [{title: "xxx", url: "xxx"}, ...]
# File lib/domains_scanner/crawlers/baidu.rb, line 13
# Extracts the search hits from a Baidu result page.
#
# Baidu encrypts the real target link, so only the URL it displays under
# each hit can be used. That text may be abbreviated
# (e.g. "bbs.abc.net/for...php?..."), but it is enough to identify the host.
#
# Hits without any displayed URL are skipped instead of being reported with
# a meaningless "http://" URL.
#
# @param doc [#search] parsed result page; must answer +search(css)+ with an
#   enumerable of nodes that themselves answer +search(css)+
#   (presumably a Nokogiri/Mechanize document -- confirm against the caller)
# @return [Array<Hash>] one +{ title: String, url: String }+ per usable hit,
#   e.g. [{title: "xxx", url: "xxx"}, ...]
def parse_results(doc)
  # URI.encode was removed in Ruby 3.0; RFC2396_Parser#escape is the routine
  # it used to delegate to, so the escaping rules stay the same.
  escaper = URI::RFC2396_Parser.new

  hits = doc.search(".result").map do |item|
    shown = item.search("div:last-child > a.c-showurl").text.strip
    # The collection returned by #search is truthy even when nothing matched,
    # so presence has to be decided on the extracted text.
    next if shown.empty?

    url = shown.start_with?("http") ? shown : "http://#{shown}"
    # Use the headline link text as the title, not the text of the whole hit.
    { title: item.search("h3.t > a").text, url: escaper.escape(url) }
  end

  hits.compact
end