class InstagramCrawler::Parser::Html
Attributes
html[R]
Public Class Methods
new(url)
click to toggle source
# File lib/instagram_crawler/parser/html.rb, line 6
# Downloads the page at +url+ through #get_html and stores the returned
# markup in @html, where the parsing methods read it back via +html+.
#
# url - address of the page to fetch.
def initialize(url)
  page_source = get_html(url)
  @html = page_source
end
Public Instance Methods
parsing()
click to toggle source
# File lib/instagram_crawler/parser/html.rb, line 10
# Parses a profile page. Reads the JSON assigned to `window._sharedData`
# in the page's script tag, passes every timeline edge to #loop_edges, and
# returns the paging information together with the user id.
#
# Returns a two-element Array: the "page_info" entry of the user's timeline
# media, and the "logging_page_id" with its "profilePage_" prefix removed.
def parsing
  doc = Nokogiri::HTML(html)
  js_data = doc.at_xpath("//script[contains(text(),'window._sharedData')]")
  # Index 21 skips the 21-character "window._sharedData = " prefix; -2 drops
  # the final character (presumably the statement's trailing ";").
  json = JSON.parse(js_data.text[21..-2])
  profile = json["entry_data"]["ProfilePage"][0]
  timeline = profile["graphql"]["user"]["edge_owner_to_timeline_media"]
  page_info = timeline["page_info"]
  # Strip only the literal leading prefix. String#delete treats its argument
  # as a character set and would remove those characters anywhere in the id.
  user_id = profile["logging_page_id"].sub(/\AprofilePage_/, "")
  loop_edges(timeline["edges"])
  [page_info, user_id]
end
parsing_photo_page()
click to toggle source
# File lib/instagram_crawler/parser/html.rb, line 30
# Parses a single post page from the JSON assigned to `window._sharedData`.
#
# Returns the "edges" list under "edge_sidecar_to_children" when the post
# has that entry; otherwise returns the post's "display_url".
def parsing_photo_page
  script = Nokogiri::HTML(html).at_xpath("//script[contains(text(),'window._sharedData')]")
  # Index 21 skips the "window._sharedData = " prefix; -2 drops the last character.
  shared_data = JSON.parse(script.text[21..-2])
  media = shared_data["entry_data"]["PostPage"][0]["graphql"]["shortcode_media"]
  sidecar = media["edge_sidecar_to_children"]
  return media["display_url"] unless sidecar

  sidecar["edges"]
end
parsing_video_page()
click to toggle source
# File lib/instagram_crawler/parser/html.rb, line 24
# Returns the value of the +content+ attribute on the page's
# <meta property="og:video"> tag, i.e. the video URL the page advertises.
#
# NOTE(review): when the page has no og:video meta tag the lookup yields no
# node and this still fails with NoMethodError, as before — confirm whether
# callers need a clearer error.
def parsing_video_page
  doc = Nokogiri::HTML(html)
  meta_v = doc.at_xpath("//meta[@property='og:video']")
  # Read the attribute by name: taking the last attribute node is only
  # correct when +content+ happens to be written last in the tag.
  meta_v["content"]
end
Private Instance Methods
get_html(url)
click to toggle source
# File lib/instagram_crawler/parser/html.rb, line 73
# Fetches +url+ and returns the response converted with +to_s+.
# The request goes through HTTP.via(Config.proxyname, Config.port) when
# Config.proxyname is set, and through a plain HTTP.get otherwise.
#
# Raises Errors::HttpError ("<code> <reason>") for any status other than 200.
def get_html(url)
  response =
    if Config.proxyname
      HTTP.via(Config.proxyname, Config.port).get(url)
    else
      HTTP.get(url)
    end

  if response.code != 200
    raise Errors::HttpError, "#{response.code} #{response.reason}"
  end

  response.to_s
end
loop_edges(edges)
click to toggle source
# File lib/instagram_crawler/parser/html.rb, line 45
# Walks the timeline edges and downloads the media behind each one.
#
# An edge is skipped when Config.before_date is set and
# Config.parse_before_date is earlier than the edge's timestamp. Every
# remaining timestamp is passed to check_after_time before the edge is
# handled. Videos and single photos are logged, printed via +output+ and
# downloaded; a post whose photo page yields an Array goes to +parse_post+.
#
# edges - list of hashes, each holding its media under the "node" key.
def loop_edges(edges)
  edges.each do |edge|
    node = edge["node"]
    taken_at = node["taken_at_timestamp"]
    next if Config.before_date && (Config.parse_before_date < taken_at)

    check_after_time(taken_at)
    time = parse_to_date(taken_at)
    page_url = "https://www.instagram.com/p/#{node["shortcode"]}/"

    if node["is_video"]
      Logger.info "========VIDEO========".light_yellow
      video_url = Html.new(page_url).parsing_video_page
      output(time, video_url)
      File.download(video_url, 'video', time)
      next
    end

    # The photo page gives back either an Array (multi-item post) or a
    # single display URL.
    media = Html.new(page_url).parsing_photo_page
    case media
    when Array
      Logger.info "========POST========".light_magenta
      parse_post(media, time)
    else
      Logger.info "========PHOTO========".light_green
      output(time, media)
      File.download(media, 'photo', time)
    end
  end
end