module ScrapeHelper
Scrapes the following data from a page: title, descriptions, social profiles, and contact details.
Public Instance Methods
perform_scrape(url, read_timeout, open_timeout)
click to toggle source
# File lib/scraper/scrape_helper.rb, line 9
#
# Requests +url+ through ScrapeRequest, normalizes the response body to UTF-8
# and hands it to +grep_data+.
#
# The whole operation runs under Timeout.timeout, with the limit computed by
# +scraper_timeout+ from the two timeouts passed in.
#
# @param url the address handed to ScrapeRequest.new
# @param read_timeout forwarded to ScrapeRequest.new and to +scraper_timeout+
# @param open_timeout forwarded to ScrapeRequest.new and to +scraper_timeout+
# @return whatever +grep_data+ returns for the UTF-8 body
# @raise [WebScraper::ParserError] when transcoding still fails after one
#   lossy retry, or on Encoding::CompatibilityError
# @raise [WebScraper::RequestError] on any other StandardError raised while
#   normalizing or parsing the body
# @raise [WebScraper::TimeoutError] when a Timeout::Error escapes the block
def perform_scrape(url, read_timeout, open_timeout)
  timeout_in_sec = scraper_timeout(read_timeout, open_timeout)
  Timeout.timeout(timeout_in_sec) do
    # NOTE(review): the request itself sits outside the begin/rescue below, so
    # only Timeout::Error (and subclasses) raised here are translated by this
    # method; other request failures propagate as raised by ScrapeRequest.
    response = ScrapeRequest.new(url, read_timeout, open_timeout)
    retry_count = 0
    body = response.body
    begin
      # Drop NUL bytes before detection and transcoding.
      body = body.tr("\000", '')
      # String#detect_encoding is not defined in this view; presumably it comes
      # from a charset-detection extension and returns a Hash with :encoding.
      encoding = body.detect_encoding[:encoding]
      body = body.encode('UTF-8', encoding)
      grep_data(body)
    rescue Encoding::UndefinedConversionError,
           Encoding::InvalidByteSequenceError,
           ArgumentError => e
      # Bytes that are invalid for the detected charset must take the same
      # lossy fallback as unmappable characters; without
      # InvalidByteSequenceError here they were reported as a RequestError.
      retry_count += 1
      raise WebScraper::ParserError, e.message if retry_count > 1

      # Lossy fallback: replace anything that cannot be represented, then run
      # the normalize/parse sequence once more.
      body = body.encode('UTF-8', invalid: :replace, undef: :replace, replace: '?')
      retry
    rescue Encoding::CompatibilityError => e
      raise WebScraper::ParserError, e.message
    rescue StandardError => e
      raise WebScraper::RequestError, e.message
    end
  end
rescue Timeout::Error => e
  raise WebScraper::TimeoutError, e.message
end
Private Instance Methods
grep_data(response)
click to toggle source
# File lib/scraper/scrape_helper.rb, line 41
#
# Runs every grep_* extractor against +response+ and collects the results in
# a Hash keyed by field name. Keys are inserted in the order shown below.
#
# @param response the value passed to each grep_* helper
# @return [Hash] one entry per extracted field
def grep_data(response)
  scraped = {}

  # Title and descriptions
  scraped[:title] = grep_title(response)
  scraped[:meta_description] = grep_meta_description(response)
  scraped[:org_description] = grep_org_description(response)
  scraped[:twitter_description] = grep_twitter_description(response)

  # Social profiles
  scraped[:twitter_profile] = grep_twitter_profile(response)
  scraped[:linkedin_profile] = grep_linkedin_profile(response)
  scraped[:facebook_profile] = grep_facebook_profile(response)
  scraped[:instagram_profile] = grep_instagram_profile(response)
  scraped[:vimeo_profile] = grep_vimeo_profile(response)
  scraped[:pinterest_profile] = grep_pinterest_profile(response)
  scraped[:youtube_channel] = grep_youtube_channel(response)

  # Contact details
  scraped[:emails] = grep_emails(response)
  scraped[:phone_numbers] = grep_phone_numbers(response)

  # Redirect target
  scraped[:redirected_to] = grep_redirected_to_url(response)

  scraped
end
scraper_timeout(read_timeout, open_timeout)
click to toggle source
# File lib/scraper/scrape_helper.rb, line 37
#
# Computes the limit for the whole scrape: the sum of both timeouts plus one.
#
# @param read_timeout first addend
# @param open_timeout second addend
# @return the sum of +read_timeout+ and +open_timeout+, increased by 1
def scraper_timeout(read_timeout, open_timeout)
  combined = read_timeout + open_timeout
  # Presumably the extra 1 lets the request-level timeouts fire before this
  # outer limit does; confirm against the caller.
  combined + 1
end