class UKPlanningScraper::Authority
Attributes

name [R] — the authority's name (read-only)
url [R] — the authority's search URL (read-only)
Public Class Methods
all()
click to toggle source
# File lib/uk_planning_scraper/authority.rb, line 84 def self.all @@authorities end
load()
click to toggle source
# File lib/uk_planning_scraper/authority.rb, line 122 def self.load # Don't run this method more than once return unless @@authorities.empty? CSV.foreach(File.join(File.dirname(__dir__), 'uk_planning_scraper', \ 'authorities.csv'), :headers => true) do |line| auth = Authority.new(line['authority_name'], line['url']) if line['tags'] auth.add_tags(line['tags'].split(/\s+/)) end auth.add_tag(auth.system) @@authorities << auth end end
named(name)
click to toggle source
# File lib/uk_planning_scraper/authority.rb, line 95
# Look up a single authority by its exact name.
# @param name [String]
# @return [Authority]
# @raise [AuthorityNotFound] when no authority has that name
def self.named(name)
  match = @@authorities.find { |a| a.name == name }
  raise AuthorityNotFound unless match
  match
end
new(name, url)
click to toggle source
# File lib/uk_planning_scraper/authority.rb, line 9
# Build an Authority with whitespace-trimmed name and URL and empty state.
# @param name [String] authority name
# @param url [String] search page URL
def initialize(name, url)
  @name = name.strip
  @url = url.strip
  # Tags (strings, arbitrary order), scraped Application objects,
  # and the parameters for the next scrape all start empty.
  @tags, @applications, @scrape_params = [], [], {}
end
not_tagged(tag)
click to toggle source
Not tagged x
# File lib/uk_planning_scraper/authority.rb, line 109
# Authorities NOT carrying the given tag.
# @param tag [String] a normalised tag (see add_tag)
# @return [Array<Authority>]
def self.not_tagged(tag)
  # Idiomatic Enumerable#reject replaces manual each-and-accumulate
  @@authorities.reject { |a| a.tagged?(tag) }
end
tagged(tag)
click to toggle source
Tagged x
# File lib/uk_planning_scraper/authority.rb, line 102
# Authorities carrying the given tag.
# @param tag [String] a normalised tag (see add_tag)
# @return [Array<Authority>]
def self.tagged(tag)
  # Idiomatic Enumerable#select replaces manual each-and-accumulate
  @@authorities.select { |a| a.tagged?(tag) }
end
untagged()
click to toggle source
Authorities with no tags
# File lib/uk_planning_scraper/authority.rb, line 116
# Authorities with no tags at all.
# @return [Array<Authority>]
def self.untagged
  # Idiomatic Enumerable#select replaces manual each-and-accumulate
  @@authorities.select { |a| a.tags.empty? }
end
Public Instance Methods
add_tag(tag)
click to toggle source
Add a single tag to existing tags
# File lib/uk_planning_scraper/authority.rb, line 61
# Add a single tag to the existing tags, normalised first
# (trimmed, lowercased, all spaces removed). Duplicates are skipped.
def add_tag(tag)
  normalised = tag.strip.downcase.delete(' ')
  @tags << normalised unless tagged?(normalised) # prevent duplicates
end
applicant_name(s)
click to toggle source
# File lib/uk_planning_scraper/authority_scrape_params.rb, line 54
# Set the applicant-name search parameter (Idox systems only).
# @param s [String]
# @return [self] for method chaining
# @raise [NoMethodError] when this authority's system is not Idox
def applicant_name(s)
  unless system == 'idox'
    # raise Class, message is the idiomatic form; message normalised to a
    # clean single line (the old line-continuation embedded stray whitespace)
    raise NoMethodError, "applicant_name is only implemented for Idox. " \
      "This authority (#{@name}) is #{system.capitalize}."
  end
  check_class(s, String)
  @scrape_params[:applicant_name] = s.strip
  self
end
application_type(s)
click to toggle source
# File lib/uk_planning_scraper/authority_scrape_params.rb, line 76
# Set the application-type search parameter (Idox systems only).
# @param s [String]
# @return [self] for method chaining
# @raise [NoMethodError] when this authority's system is not Idox
def application_type(s)
  unless system == 'idox'
    # raise Class, message is the idiomatic form; message normalised to a
    # clean single line (the old line-continuation embedded stray whitespace)
    raise NoMethodError, "application_type is only implemented for Idox. " \
      "This authority (#{@name}) is #{system.capitalize}."
  end
  check_class(s, String)
  @scrape_params[:application_type] = s.strip
  self
end
case_officer_code(s)
click to toggle source
# File lib/uk_planning_scraper/authority_scrape_params.rb, line 65
# Set the case-officer-code search parameter (Northgate systems only).
# @param s [String]
# @return [self] for method chaining
# @raise [NoMethodError] when this authority's system is not Northgate
def case_officer_code(s)
  unless system == 'northgate'
    # raise Class, message is the idiomatic form; message normalised to a
    # clean single line (the old line-continuation embedded stray whitespace)
    raise NoMethodError, "case_officer_code is only implemented for Northgate. " \
      "This authority (#{@name}) is #{system.capitalize}."
  end
  check_class(s, String)
  @scrape_params[:case_officer_code] = s.strip
  self
end
decided_days(n)
click to toggle source
# File lib/uk_planning_scraper/authority_scrape_params.rb, line 40
# Search for applications decided within the last n days.
# Assumes that every scraper/system can do a date range search.
# @param n [Integer] number of days, must be greater than 0
# @return [self] for method chaining
# @raise [ArgumentError] when n is not positive
def decided_days(n)
  # Fixnum was deprecated in Ruby 2.4 and removed in 3.2; Integer is correct.
  check_class(n, Integer)
  raise ArgumentError, "decided_days must be greater than 0" unless n > 0
  decided_from(Date.today - (n - 1))
  decided_to(Date.today)
  self
end
development_type(s)
click to toggle source
# File lib/uk_planning_scraper/authority_scrape_params.rb, line 87
# Set the development-type search parameter (Idox systems only).
# @param s [String]
# @return [self] for method chaining
# @raise [NoMethodError] when this authority's system is not Idox
def development_type(s)
  unless system == 'idox'
    # raise Class, message is the idiomatic form; message normalised to a
    # clean single line (the old line-continuation embedded stray whitespace)
    raise NoMethodError, "development_type is only implemented for Idox. " \
      "This authority (#{@name}) is #{system.capitalize}."
  end
  check_class(s, String)
  @scrape_params[:development_type] = s.strip
  self
end
received_days(n)
click to toggle source
# File lib/uk_planning_scraper/authority_scrape_params.rb, line 26
# Search for applications received within the last n days.
# Assumes that every scraper/system can do a date range search.
# @param n [Integer] number of days, must be greater than 0
# @return [self] for method chaining
# @raise [ArgumentError] when n is not positive
def received_days(n)
  # Fixnum was deprecated in Ruby 2.4 and removed in 3.2; Integer is correct.
  check_class(n, Integer)
  raise ArgumentError, "received_days must be greater than 0" unless n > 0
  received_from(Date.today - (n - 1))
  received_to(Date.today)
  self
end
scrape(options = {})
click to toggle source
# File lib/uk_planning_scraper/authority.rb, line 17 def scrape(options = {}) default_options = { delay: 10, } # The user-supplied options override the defaults options = default_options.merge(options) # Select which scraper to use case system when 'idox' @applications = scrape_idox(@scrape_params, options) when 'northgate' @applications = scrape_northgate(@scrape_params, options) else raise SystemNotSupported.new("Planning system not supported for \ #{@name} at URL: #{@url}") end # Post processing @applications.each do |app| app.authority_name = @name end # Output as an array of hashes output = [] # FIXME - silently ignores invalid apps. How should we handle them? @applications.each { |app| output << app.to_hash if app.valid? } # Reset so that old params don't get used for new scrapes clear_scrape_params output # Single point of successful exit end
status(s)
click to toggle source
# File lib/uk_planning_scraper/authority_scrape_params.rb, line 98
# Set the application-status search parameter.
# @return [self] for method chaining
def status(value)
  check_class(value, String)
  @scrape_params[:status] = value.strip
  self
end
system()
click to toggle source
# File lib/uk_planning_scraper/authority.rb, line 70
# Detect which planning software system this authority runs, from its URL.
# @return [String] 'idox', 'northgate', 'ocellaweb', 'agileplanning'
#   or 'unknownsystem'
def system
  case @url
  when /search\.do\?action=advanced/i then 'idox'
  when /\.aspx/i                      then 'northgate'
  when /ocellaweb/i                   then 'ocellaweb'
  when %r{/apas/}                     then 'agileplanning'
  else 'unknownsystem'
  end
end
tagged?(tag)
click to toggle source
# File lib/uk_planning_scraper/authority.rb, line 66
# True when this authority carries the given tag (exact string match against
# the normalised tags stored by add_tag).
def tagged?(tag)
  @tags.member?(tag)
end
validated_days(n)
click to toggle source
Parameter methods for Authority#scrape
Designed to be method chained, eg:

applications = UKPlanningScraper::Authority.named("Barnet"). \
  development_type("Q22").keywords("illuminat"). \
  validated_days(30).scrape
# File lib/uk_planning_scraper/authority_scrape_params.rb, line 12
# Search for applications validated within the last n days.
# Assumes that every scraper/system can do a date range search.
# @param n [Integer] number of days, must be greater than 0
# @return [self] for method chaining
# @raise [ArgumentError] when n is not positive
def validated_days(n)
  # Fixnum was deprecated in Ruby 2.4 and removed in 3.2; Integer is correct.
  check_class(n, Integer)
  raise ArgumentError, "validated_days must be greater than 0" unless n > 0
  validated_from(Date.today - (n - 1))
  validated_to(Date.today)
  self
end
Private Instance Methods
check_class( param_value, expected_class, param_name = caller_locations(1, 1)[0].label)
click to toggle source
stackoverflow.com/questions/5100299/how-to-get-the-name-of-the-calling-method
# File lib/uk_planning_scraper/authority_scrape_params.rb, line 141
# Raise TypeError unless param_value is exactly an instance of expected_class.
# param_name defaults to the name of the calling method, via
# stackoverflow.com/questions/5100299/how-to-get-the-name-of-the-calling-method
def check_class(
  param_value,
  expected_class,
  param_name = caller_locations(1, 1)[0].label) # name of calling method
  return if param_value.class == expected_class
  raise TypeError.new(
    "#{param_name} must be a #{expected_class} not a #{param_value.class.to_s}")
end
clear_scrape_params()
click to toggle source
# File lib/uk_planning_scraper/authority_scrape_params.rb, line 136 def clear_scrape_params @scrape_params = {} end
method_missing(method_name, *args)
click to toggle source
Handle the simple params with this
# File lib/uk_planning_scraper/authority_scrape_params.rb, line 107
# The simple scrape params handled dynamically by method_missing, mapped to
# the class each value must be. Hoisted to a frozen constant so the hash is
# not rebuilt on every call.
SIMPLE_SCRAPE_PARAMS = {
  validated_from: Date,
  validated_to: Date,
  received_from: Date,
  received_to: Date,
  decided_from: Date,
  decided_to: Date,
  keywords: String
}.freeze

# Handle the simple params with this: each name in SIMPLE_SCRAPE_PARAMS
# becomes a chainable setter that type-checks its single argument, rejects
# future dates, and stores the value in @scrape_params.
# @return [self] for method chaining
# @raise [NoMethodError] for any other method name
# @raise [ArgumentError] when a Date value is in the future
def method_missing(method_name, *args)
  expected_class = SIMPLE_SCRAPE_PARAMS[method_name]
  raise NoMethodError.new(method_name.to_s) unless expected_class

  value = args[0]
  check_class(value, expected_class, method_name.to_s)
  value.strip! if value.class == String # NOTE: mutates the caller's string in place

  if value.class == Date && value > Date.today
    raise ArgumentError.new("#{method_name} can't be a date in the " +
      "future (#{value.to_s})")
  end

  @scrape_params[method_name] = value
  self
end

# Companion to method_missing (previously missing) so that respond_to?
# correctly reports the dynamic parameter setters.
def respond_to_missing?(method_name, include_private = false)
  SIMPLE_SCRAPE_PARAMS.key?(method_name) || super
end
scrape_idox(params, options)
click to toggle source
# File lib/uk_planning_scraper/idox.rb, line 7 def scrape_idox(params, options) puts "Using Idox scraper." base_url = @url.match(/(https?:\/\/.+?)\//)[1] apps = [] agent = Mechanize.new puts "Getting: #{@url}" page = agent.get(@url) # load the search form page # Check that the search form is actually present. # When Idox has an internal error it returns an error page with HTTP 200. unless form = page.form('searchCriteriaForm') puts "Error: Search form page failed to load due to Idox internal error." return [] end # form.action = form.action + '&searchCriteria.resultsPerPage=100' # Fill out and submit search form # Add expected fields to form if they're not already present so that searches using these terms work %w{ date(applicationReceivedStart) date(applicationReceivedEnd) }.each { |f| form.add_field!(f) unless form.has_field?(f) } date_format = "%d/%m/%Y" form.send(:"date(applicationReceivedStart)", params[:received_from].strftime(date_format)) if params[:received_from] form.send(:"date(applicationReceivedEnd)", params[:received_to].strftime(date_format)) if params[:received_to] form.send(:"date(applicationValidatedStart)", params[:validated_from].strftime(date_format)) if params[:validated_from] form.send(:"date(applicationValidatedEnd)", params[:validated_to].strftime(date_format)) if params[:validated_to] form.send(:"date(applicationDecisionStart)", params[:decided_from].strftime(date_format)) if params[:decided_from] form.send(:"date(applicationDecisionEnd)", params[:decided_to].strftime(date_format)) if params[:decided_to] form.send(:"searchCriteria\.description", params[:keywords]) form.send(:"searchCriteria\.caseStatus", params[:status]) # Some councils don't have the applicant name on their form, eg Bexley form.send(:"searchCriteria\.applicantName", params[:applicant_name]) if form.has_field? 'searchCriteria.applicantName' form.send(:"searchCriteria\.caseType", params[:application_type]) if form.has_field? 
'searchCriteria.caseType' # Only some Idox sites (eg Bolton) have a 'searchCriteria.developmentType' parameter form.send(:"searchCriteria\.developmentType", params[:development_type]) if form.has_field? 'searchCriteria.developmentType' page = form.submit if page.search('.errors').inner_text.match(/Too many results found/i) raise TooManySearchResults.new("Scrape in smaller chunks. Use shorter date ranges and/or more search parameters.") end loop do # Parse search results items = page.search('li.searchresult') puts "Found #{items.size} apps on this page." items.each do |app| data = Application.new # Parse info line info_line = app.at("p.metaInfo").inner_text.strip bits = info_line.split('|').map { |e| e.strip.delete("\r\n") } bits.each do |bit| if matches = bit.match(/Ref\. No:\s+(.+)/) data.council_reference = matches[1] end if matches = bit.match(/(Received|Registered):\s+.*(\d{2}\s\w{3}\s\d{2}\d{2}?)/) data.date_received = Date.parse(matches[2]) end if matches = bit.match(/Validated:\s+.*(\d{2}\s\w{3}\s\d{2}\d{2}?)/) data.date_validated = Date.parse(matches[1]) end if matches = bit.match(/Status:\s+(.+)/) data.status = matches[1] end end data.scraped_at = Time.now data.info_url = base_url + app.at('a')['href'] data.address = app.at('p.address').inner_text.strip data.description = app.at('a').inner_text.strip apps << data end # Get the Next button from the pager, if there is one if next_button = page.at('a.next') next_url = base_url + next_button[:href]# + '&searchCriteria.resultsPerPage=100' sleep options[:delay] puts "Getting: #{next_url}" page = agent.get(next_url) else break end end # Scrape the summary tab for each app apps.each_with_index do |app, i| sleep options[:delay] puts "#{i + 1} of #{apps.size}: #{app.info_url}" res = agent.get(app.info_url) if res.code == '200' # That's a String not an Integer, ffs # Parse the summary tab for this app app.scraped_at = Time.now # The Documents tab doesn't show if there are no documents (we get li.nodocuments instead) 
# Bradford has #tab_documents but without the document count on it app.documents_count = 0 if documents_link = res.at('.associateddocument a') if documents_link.inner_text.match(/\d+/) app.documents_count = documents_link.inner_text.match(/\d+/)[0].to_i app.documents_url = base_url + documents_link[:href] end elsif documents_link = res.at('#tab_documents') if documents_link.inner_text.match(/\d+/) app.documents_count = documents_link.inner_text.match(/\d+/)[0].to_i app.documents_url = base_url + documents_link[:href] end end # We need to find values in the table by using the th labels. # The row indexes/positions change from site to site (or even app to app) so we can't rely on that. res.search('#simpleDetailsTable tr').each do |row| key = row.at('th').inner_text.strip value = row.at('td').inner_text.strip case key when 'Reference' app.council_reference = value when 'Alternative Reference' app.alternative_reference = value unless value.empty? when 'Planning Portal Reference' app.alternative_reference = value unless value.empty? when 'Application Received' app.date_received = Date.parse(value) if value.match(/\d/) when 'Application Registered' app.date_received = Date.parse(value) if value.match(/\d/) when 'Application Validated' app.date_validated = Date.parse(value) if value.match(/\d/) when 'Address' app.address = value unless value.empty? when 'Proposal' app.description = value unless value.empty? when 'Status' app.status = value unless value.empty? when 'Decision' app.decision = value unless value.empty? when 'Decision Issued Date' app.date_decision = Date.parse(value) if value.match(/\d/) when 'Appeal Status' app.appeal_status = value unless value.empty? when 'Appeal Decision' app.appeal_decision = value unless value.empty? else puts "Error: key '#{key}' not found" end # case end # each row else puts "Error: HTTP #{res.code}" end # if end # scrape summary tab for apps apps end
scrape_northgate(params, options)
click to toggle source
# File lib/uk_planning_scraper/northgate.rb, line 8
# Scrape a Northgate-based planning site: POST the ASP.NET search form
# (carrying __VIEWSTATE/__EVENTVALIDATION and session cookies), follow the
# redirect to an all-on-one-page result list, and parse the results table.
# @param params [Hash] scrape parameters (date ranges, keywords, status, etc.)
# @param options [Hash] scrape options (NOTE(review): :delay appears unused
#   here, unlike the Idox scraper — confirm)
# @return [Array<Application>] populated Application objects
# @raise [RuntimeError] on a bad search-page response or a missing redirect
def scrape_northgate(params, options)
  puts "Using Northgate scraper."
  base_url = @url.match(/(https?:\/\/.+?)\//)[1]

  # Remove 'generalsearch.aspx' from the end and add '/Generic/' - case sensitive?
  generic_url = @url.match(/.+\//)[0] + 'Generic/'

  apps = []

  $stdout.sync = true # Flush output buffer after every write so log messages appear immediately.
  logger = Logger.new($stdout)
  logger.level = Logger::DEBUG

  # NOTE(review): date_regex is never used below — confirm before removing.
  date_regex = /\d{2}-\d{2}-\d{4}/

  form_vars = {
    'csbtnSearch' => 'Search' # required
  }

  # Keywords
  form_vars['txtProposal'] = params[:keywords]

  # Date received from and to
  # NOTE(review): the three date blocks share cboSelectDateValue/dateStart/
  # dateEnd, so a later block silently overwrites an earlier one — confirm
  # that only one date range is expected per scrape.
  if params[:received_from] || params[:received_to]
    form_vars['cboSelectDateValue'] = 'DATE_RECEIVED'
    form_vars['rbGroup'] = 'rbRange'
    form_vars['dateStart'] = params[:received_from].to_s if params[:received_from] # YYYY-MM-DD
    form_vars['dateEnd'] = params[:received_to].to_s if params[:received_to] # YYYY-MM-DD
  end

  # Date validated from and to
  if params[:validated_from] || params[:validated_to]
    form_vars['cboSelectDateValue'] = 'DATE_VALID'
    form_vars['rbGroup'] = 'rbRange'
    form_vars['dateStart'] = params[:validated_from].to_s if params[:validated_from] # YYYY-MM-DD
    form_vars['dateEnd'] = params[:validated_to].to_s if params[:validated_to] # YYYY-MM-DD
  end

  # Date decided from and to
  if params[:decided_from] || params[:decided_to]
    form_vars['cboSelectDateValue'] = 'DATE_DECISION'
    form_vars['rbGroup'] = 'rbRange'
    form_vars['dateStart'] = params[:decided_from].to_s if params[:decided_from] # YYYY-MM-DD
    form_vars['dateEnd'] = params[:decided_to].to_s if params[:decided_to] # YYYY-MM-DD
  end

  # Status
  if params[:status]
    form_vars['cboStatusCode'] = params[:status]
  end

  # Case officer code (uses a different search page)
  if params[:case_officer_code]
    form_vars['cboCaseOfficerCode'] = params[:case_officer_code]
    @url.sub!('GeneralSearch.aspx', 'CaseOfficerWorkloadSearch.aspx')
  end

  logger.info "Form variables: #{form_vars.to_s}"

  headers = {
    'Origin' => base_url,
    'Referer' => @url,
  }
  logger.debug "HTTP request headers:"
  logger.debug(headers.to_s)

  # GET the search page first to harvest the ASP.NET hidden form fields
  logger.debug "GET: " + @url
  response = HTTP.headers(headers).get(@url)
  logger.debug "Response code: HTTP " + response.code.to_s

  if response.code == 200
    doc = Nokogiri::HTML(response.to_s)
    asp_vars = {
      '__VIEWSTATE' => doc.at('#__VIEWSTATE')['value'],
      '__EVENTVALIDATION' => doc.at('#__EVENTVALIDATION')['value']
    }
  else
    logger.fatal "Bad response from search page. Response code: #{response.code.to_s}."
    raise RuntimeError.new("Northgate: Bad response from search page. Response code: #{response.code.to_s}.")
  end

  cookies = {}
  response.cookies.each { |c| cookies[c.name] = c.value }

  form_vars.merge!(asp_vars)

  logger.debug "POST: " + @url
  response2 = HTTP.headers(headers).cookies(cookies).post(@url, :form => form_vars)
  logger.debug "Response code: HTTP " + response2.code.to_s

  if response2.code == 302
    # Follow the redirect manually
    # Set the page size (PS) to max so we don't have to page through search results
    logger.debug "Location: #{response2.headers['Location']}"
    # NOTE(review): URI::encode was deprecated in Ruby 2.7 and removed in
    # 3.0 — this needs replacing to run on modern Rubies.
    results_url = URI::encode(base_url + response2.headers['Location'].gsub!('PS=10', 'PS=99999'))
    logger.debug "GET: " + results_url
    response3 = HTTP.headers(headers).cookies(cookies).get(results_url)
    logger.debug "Response code: HTTP " + response3.code.to_s
    doc = Nokogiri::HTML(response3.to_s)
  else
    logger.error "Didn't get redirected from search."
    raise RuntimeError.new("Northgate: didn't get redirected from search.")
  end

  rows = doc.search("table.display_table tr")
  logger.info "Found #{rows.size - 1} applications in search results." # The first row is the header row

  # Iterate over search results
  rows.each do |row|
    if row.at("td") # skip header row which only has th's
      cells = row.search("td")
      app = Application.new
      app.scraped_at = Time.now
      app.council_reference = cells[0].inner_text.strip
      app.info_url = URI::encode(generic_url + cells[0].at('a')[:href].strip)
      app.info_url.gsub!(/%0./, '') # FIXME. Strip junk chars from URL - how can we prevent this?
      app.address = cells[1].inner_text.strip
      app.description = cells[2].inner_text.strip
      app.status = cells[3].inner_text.strip
      raw_date_validated = cells[4].inner_text.strip
      app.date_validated = Date.parse(raw_date_validated) if raw_date_validated != '--'
      app.decision = cells[5].inner_text.strip if cells[5] # Some councils don't have this column, eg Hackney
      apps << app
    end
  end

  apps
end