class UKPlanningScraper::Authority

Attributes

name[R]
url[R]

Public Class Methods

all() click to toggle source
# File lib/uk_planning_scraper/authority.rb, line 84
# All known authorities.
# Returns the class-level list (empty until Authority.load has run).
def self.all
  @@authorities
end
load() click to toggle source
# File lib/uk_planning_scraper/authority.rb, line 122
# Populate the class-level authority list from the bundled
# authorities.csv. Safe to call repeatedly: it is a no-op once the
# list has been loaded.
def self.load
  return unless @@authorities.empty? # Don't run this method more than once

  csv_path = File.join(File.dirname(__dir__), 'uk_planning_scraper',
    'authorities.csv')

  CSV.foreach(csv_path, headers: true) do |row|
    authority = Authority.new(row['authority_name'], row['url'])
    authority.add_tags(row['tags'].split(/\s+/)) if row['tags']
    # Every authority is also tagged with its back-end system name
    authority.add_tag(authority.system)
    @@authorities << authority
  end
end
named(name) click to toggle source
# File lib/uk_planning_scraper/authority.rb, line 95
# Look up a single authority by its exact name.
# Raises AuthorityNotFound when no authority matches.
def self.named(name)
  match = @@authorities.find { |authority| authority.name == name }
  raise AuthorityNotFound unless match
  match
end
new(name, url) click to toggle source
# File lib/uk_planning_scraper/authority.rb, line 9
# Build an authority from a display name and a search-page URL.
# Surrounding whitespace is trimmed from both.
def initialize(name, url)
  @name, @url = name.strip, url.strip
  @tags = []          # tag strings, arbitrary order
  @applications = []  # Application objects collected by #scrape
  @scrape_params = {} # search params accumulated by the chainable setters
end
not_tagged(tag) click to toggle source

Not tagged x

# File lib/uk_planning_scraper/authority.rb, line 109
# Authorities that do NOT carry the given tag.
#
# tag - a normalised tag string (see #add_tag).
# Returns an Array of Authority objects.
def self.not_tagged(tag)
  # reject replaces the manual each-and-accumulate loop
  @@authorities.reject { |a| a.tagged?(tag) }
end
tagged(tag) click to toggle source

Tagged x

# File lib/uk_planning_scraper/authority.rb, line 102
# Authorities that carry the given tag.
#
# tag - a normalised tag string (see #add_tag).
# Returns an Array of Authority objects.
def self.tagged(tag)
  # select replaces the manual each-and-accumulate loop
  @@authorities.select { |a| a.tagged?(tag) }
end
tags() click to toggle source

List all the tags in use

# File lib/uk_planning_scraper/authority.rb, line 89
# List all the tags in use across every authority,
# de-duplicated and sorted alphabetically.
def self.tags
  # flat_map replaces the collect-then-flatten loop
  @@authorities.flat_map(&:tags).uniq.sort
end
untagged() click to toggle source

Authorities with no tags

# File lib/uk_planning_scraper/authority.rb, line 116
# Authorities with no tags at all.
# Returns an Array of Authority objects.
def self.untagged
  # select replaces the manual each-and-accumulate loop
  @@authorities.select { |a| a.tags.empty? }
end

Public Instance Methods

add_tag(tag) click to toggle source

Add a single tag to existing tags

# File lib/uk_planning_scraper/authority.rb, line 61
# Add a single tag to the existing tags.
# Tags are normalised (stripped, downcased, internal spaces removed)
# and duplicates are silently ignored.
def add_tag(tag)
  normalised = tag.strip.downcase.delete(' ')
  @tags << normalised unless @tags.include?(normalised)
end
add_tags(tags) click to toggle source

Add multiple tags to existing tags

# File lib/uk_planning_scraper/authority.rb, line 56
# Add multiple tags to existing tags.
# Each element goes through #add_tag, which normalises and de-duplicates.
def add_tags(tags)
  tags.each { |t| add_tag(t) }
end
applicant_name(s) click to toggle source
# File lib/uk_planning_scraper/authority_scrape_params.rb, line 54
# Restrict the scrape to a given applicant name (Idox systems only).
# Chainable; returns self.
#
# Raises NoMethodError for non-Idox authorities, since only the Idox
# search form has this field.
def applicant_name(s)
  unless system == 'idox'
    # Adjacent string literals: the old backslash continuation baked a
    # run of indentation spaces into the runtime error message.
    raise NoMethodError, "applicant_name is only implemented for Idox. " \
      "This authority (#{@name}) is #{system.capitalize}."
  end

  check_class(s, String)
  @scrape_params[:applicant_name] = s.strip
  self
end
application_type(s) click to toggle source
# File lib/uk_planning_scraper/authority_scrape_params.rb, line 76
# Restrict the scrape to a given application type (Idox systems only).
# Chainable; returns self.
#
# Raises NoMethodError for non-Idox authorities, since only the Idox
# search form has this field.
def application_type(s)
  unless system == 'idox'
    # Adjacent string literals: the old backslash continuation baked a
    # run of indentation spaces into the runtime error message.
    raise NoMethodError, "application_type is only implemented for Idox. " \
      "This authority (#{@name}) is #{system.capitalize}."
  end

  check_class(s, String)
  @scrape_params[:application_type] = s.strip
  self
end
case_officer_code(s) click to toggle source
# File lib/uk_planning_scraper/authority_scrape_params.rb, line 65
# Restrict the scrape to a given case officer code (Northgate only).
# Chainable; returns self.
#
# Raises NoMethodError for non-Northgate authorities, since only the
# Northgate search supports this.
def case_officer_code(s)
  unless system == 'northgate'
    # Adjacent string literals: the old backslash continuation baked a
    # run of indentation spaces into the runtime error message.
    raise NoMethodError, "case_officer_code is only implemented for Northgate. " \
      "This authority (#{@name}) is #{system.capitalize}."
  end

  check_class(s, String)
  @scrape_params[:case_officer_code] = s.strip
  self
end
decided_days(n) click to toggle source
# File lib/uk_planning_scraper/authority_scrape_params.rb, line 40
# Restrict the scrape to applications decided within the last n days
# (inclusive of today). Assumes that every scraper/system can do a
# date range search. Chainable; returns self.
def decided_days(n)
  # Fixnum was removed in Ruby 3 (NameError); Integer is the correct check.
  check_class(n, Integer)

  raise ArgumentError, "decided_days must be greater than 0" unless n > 0

  decided_from(Date.today - (n - 1))
  decided_to(Date.today)
  self
end
development_type(s) click to toggle source
# File lib/uk_planning_scraper/authority_scrape_params.rb, line 87
# Restrict the scrape to a given development type (Idox systems only).
# Chainable; returns self.
#
# Raises NoMethodError for non-Idox authorities, since only the Idox
# search form has this field.
def development_type(s)
  unless system == 'idox'
    # Adjacent string literals: the old backslash continuation baked a
    # run of indentation spaces into the runtime error message.
    raise NoMethodError, "development_type is only implemented for Idox. " \
      "This authority (#{@name}) is #{system.capitalize}."
  end

  check_class(s, String)
  @scrape_params[:development_type] = s.strip
  self
end
received_days(n) click to toggle source
# File lib/uk_planning_scraper/authority_scrape_params.rb, line 26
# Restrict the scrape to applications received within the last n days
# (inclusive of today). Assumes that every scraper/system can do a
# date range search. Chainable; returns self.
def received_days(n)
  # Fixnum was removed in Ruby 3 (NameError); Integer is the correct check.
  check_class(n, Integer)

  raise ArgumentError, "received_days must be greater than 0" unless n > 0

  received_from(Date.today - (n - 1))
  received_to(Date.today)
  self
end
scrape(options = {}) click to toggle source
# File lib/uk_planning_scraper/authority.rb, line 17
# Run the scrape for this authority using the accumulated search
# params. Returns an array of application hashes and clears the
# params so they don't leak into the next scrape.
def scrape(options = {})
  # The user-supplied options override the defaults
  options = { delay: 10 }.merge(options)

  # Dispatch to the scraper that matches this authority's back end
  @applications =
    case system
    when 'idox'
      scrape_idox(@scrape_params, options)
    when 'northgate'
      scrape_northgate(@scrape_params, options)
    else
      raise SystemNotSupported.new("Planning system not supported for \
      #{@name} at URL: #{@url}")
    end

  # Post processing: stamp every application with this authority's name
  @applications.each { |app| app.authority_name = @name }

  # Output as an array of hashes
  # FIXME - silently ignores invalid apps. How should we handle them?
  output = @applications.select(&:valid?).map(&:to_hash)

  # Reset so that old params don't get used for new scrapes
  clear_scrape_params

  output # Single point of successful exit
end
status(s) click to toggle source
# File lib/uk_planning_scraper/authority_scrape_params.rb, line 98
# Restrict the scrape to applications with the given status string.
# Works on every supported system. Chainable; returns self.
def status(s)
  check_class(s, String)
  @scrape_params.store(:status, s.strip)
  self
end
system() click to toggle source
# File lib/uk_planning_scraper/authority.rb, line 70
# Identify the planning back end from the shape of the search URL.
# Returns one of 'idox', 'northgate', 'ocellaweb', 'agileplanning' or
# 'unknownsystem'.
def system
  case @url
  when /search\.do\?action=advanced/i then 'idox'
  when /\.aspx/i                      then 'northgate'
  when /ocellaweb/i                   then 'ocellaweb'
  when %r{/apas/}                     then 'agileplanning'
  else 'unknownsystem'
  end
end
tagged?(tag) click to toggle source
# File lib/uk_planning_scraper/authority.rb, line 66
# True when this authority carries the given (already normalised) tag.
def tagged?(tag)
  @tags.member?(tag)
end
tags() click to toggle source
# File lib/uk_planning_scraper/authority.rb, line 51
# The authority's tags as a new, alphabetically sorted array.
# The internal list is stored in arbitrary order; sort returns a copy,
# so callers can't mutate our state through the result.
def tags
  @tags.sort
end
validated_days(n) click to toggle source

Parameter methods for Authority#scrape. Designed to be method chained, eg:

applications = UKPlanningScraper::Authority.named("Barnet").
  development_type("Q22").keywords("illuminat").
  validated_days(30).scrape

# File lib/uk_planning_scraper/authority_scrape_params.rb, line 12
# Restrict the scrape to applications validated within the last n days
# (inclusive of today). Assumes that every scraper/system can do a
# date range search. Chainable; returns self.
def validated_days(n)
  # Fixnum was removed in Ruby 3 (NameError); Integer is the correct check.
  check_class(n, Integer)

  raise ArgumentError, "validated_days must be greater than 0" unless n > 0

  validated_from(Date.today - (n - 1))
  validated_to(Date.today)
  self
end

Private Instance Methods

check_class( param_value, expected_class, param_name = caller_locations(1, 1)[0].label) click to toggle source

stackoverflow.com/questions/5100299/how-to-get-the-name-of-the-calling-method

# File lib/uk_planning_scraper/authority_scrape_params.rb, line 141
# Ensure a scrape parameter has the expected type, raising TypeError
# with a message naming the public method the user called.
#
# param_name defaults to the calling method's name via caller_locations
# (stackoverflow.com/questions/5100299/how-to-get-the-name-of-the-calling-method)
def check_class(
  param_value,
  expected_class,
  param_name = caller_locations(1, 1)[0].label) # name of calling method
  # is_a? also accepts subclasses, unlike the old strict class == check
  unless param_value.is_a?(expected_class)
    raise TypeError, "#{param_name} must be a " \
      "#{expected_class} not a #{param_value.class.to_s}"
  end
end
clear_scrape_params() click to toggle source
# File lib/uk_planning_scraper/authority_scrape_params.rb, line 136
# Reset the accumulated search parameters so a subsequent scrape
# starts from a clean slate.
def clear_scrape_params
  @scrape_params = {}
end
method_missing(method_name, *args) click to toggle source

Handle the simple params with this

# File lib/uk_planning_scraper/authority_scrape_params.rb, line 107
# Handle the simple chainable scrape params (date ranges and keywords)
# here instead of defining a method per param.
#
# Each recognised name validates its single argument, stores it in
# @scrape_params and returns self for chaining. Dates may not be in
# the future. Unrecognised names go to super for a normal NoMethodError.
def method_missing(method_name, *args)
  # Recognised param name => required class of its argument
  sc_params = {
    validated_from: Date,
    validated_to: Date,
    received_from: Date,
    received_to: Date,
    decided_from: Date,
    decided_to: Date,
    keywords: String
  }

  expected_class = sc_params[method_name]
  # Not one of ours: let Ruby raise its usual, fully-detailed error
  return super unless expected_class

  value = args[0]
  check_class(value, expected_class, method_name.to_s)

  # Non-mutating strip: the old strip! modified the caller's string in place
  value = value.strip if value.is_a?(String)

  if value.is_a?(Date) && value > Date.today
    raise ArgumentError.new("#{method_name} can't be a date in the " + \
      "future (#{value.to_s})")
  end

  @scrape_params[method_name] = value
  self
end

# Keep respond_to? consistent with the params handled above.
def respond_to_missing?(method_name, include_private = false)
  %i[validated_from validated_to received_from received_to
     decided_from decided_to keywords].include?(method_name) || super
end
scrape_idox(params, options) click to toggle source
# File lib/uk_planning_scraper/idox.rb, line 7
# Scrape an Idox-based planning site: fill in and submit the advanced
# search form, walk every page of results, then fetch each
# application's summary tab for the detailed fields.
#
# params  - hash of accumulated scrape params (dates, keywords, etc).
# options - hash; only :delay (seconds to sleep between requests) is
#           used here.
#
# Returns an array of Application objects.
def scrape_idox(params, options)
  puts "Using Idox scraper."
  # Scheme and host only, for absolutising the relative links in results
  base_url = @url.match(/(https?:\/\/.+?)\//)[1]
  
  apps = []

  agent = Mechanize.new
  puts "Getting: #{@url}"
  page = agent.get(@url) # load the search form page

  # Check that the search form is actually present.
  # When Idox has an internal error it returns an error page with HTTP 200.
  unless form = page.form('searchCriteriaForm')
    puts "Error: Search form page failed to load due to Idox internal error."
    return []
  end
  # form.action = form.action + '&searchCriteria.resultsPerPage=100'

  # Fill out and submit search form

  # Add expected fields to form if they're not already present so that searches using these terms work
  %w{
    date(applicationReceivedStart)
    date(applicationReceivedEnd)
  }.each { |f| form.add_field!(f) unless form.has_field?(f) }

  # Idox date fields use UK day/month/year format
  date_format = "%d/%m/%Y"
  
  form.send(:"date(applicationReceivedStart)", params[:received_from].strftime(date_format)) if params[:received_from]
  form.send(:"date(applicationReceivedEnd)", params[:received_to].strftime(date_format)) if params[:received_to]

  form.send(:"date(applicationValidatedStart)", params[:validated_from].strftime(date_format)) if params[:validated_from]
  form.send(:"date(applicationValidatedEnd)", params[:validated_to].strftime(date_format)) if params[:validated_to]

  form.send(:"date(applicationDecisionStart)", params[:decided_from].strftime(date_format)) if params[:decided_from]
  form.send(:"date(applicationDecisionEnd)", params[:decided_to].strftime(date_format)) if params[:decided_to]

  form.send(:"searchCriteria\.description", params[:keywords])
  form.send(:"searchCriteria\.caseStatus", params[:status])
  
  # Some councils don't have the applicant name on their form, eg Bexley
  form.send(:"searchCriteria\.applicantName", params[:applicant_name]) if form.has_field? 'searchCriteria.applicantName'
  
  form.send(:"searchCriteria\.caseType", params[:application_type]) if form.has_field? 'searchCriteria.caseType'
  
  # Only some Idox sites (eg Bolton) have a 'searchCriteria.developmentType' parameter
  form.send(:"searchCriteria\.developmentType", params[:development_type]) if form.has_field? 'searchCriteria.developmentType'

  page = form.submit

  # Idox reports oversize result sets with an error banner instead of paging
  if page.search('.errors').inner_text.match(/Too many results found/i)
    raise TooManySearchResults.new("Scrape in smaller chunks. Use shorter date ranges and/or more search parameters.")
  end
  
  # Walk every page of search results, following the pager's Next link
  loop do
    # Parse search results
    items = page.search('li.searchresult')

    puts "Found #{items.size} apps on this page."

    items.each do |app|
      data = Application.new

      # Parse info line
      info_line = app.at("p.metaInfo").inner_text.strip
      bits = info_line.split('|').map { |e| e.strip.delete("\r\n") }
      
      bits.each do |bit|
        if matches = bit.match(/Ref\. No:\s+(.+)/)
          data.council_reference = matches[1]
        end

        # Dates look like "13 Apr 2018"; some sites say "Registered"
        # instead of "Received" for the same field
        if matches = bit.match(/(Received|Registered):\s+.*(\d{2}\s\w{3}\s\d{2}\d{2}?)/)
          data.date_received = Date.parse(matches[2])
        end
        
        if matches = bit.match(/Validated:\s+.*(\d{2}\s\w{3}\s\d{2}\d{2}?)/)
          data.date_validated = Date.parse(matches[1])
        end

        if matches = bit.match(/Status:\s+(.+)/)
          data.status = matches[1]
        end
      end

      data.scraped_at = Time.now
      data.info_url = base_url + app.at('a')['href']
      data.address = app.at('p.address').inner_text.strip
      data.description = app.at('a').inner_text.strip
      
      apps << data
    end
    
    # Get the Next button from the pager, if there is one
    if next_button = page.at('a.next')
      next_url = base_url + next_button[:href]# + '&searchCriteria.resultsPerPage=100'
      sleep options[:delay]
      puts "Getting: #{next_url}"
      page = agent.get(next_url)
    else
      break
    end
  end
  
  # Scrape the summary tab for each app
  apps.each_with_index do |app, i|
    sleep options[:delay]
    puts "#{i + 1} of #{apps.size}: #{app.info_url}"
    res = agent.get(app.info_url)
    
    if res.code == '200' # That's a String not an Integer, ffs
      # Parse the summary tab for this app

      app.scraped_at = Time.now

      # The Documents tab doesn't show if there are no documents (we get li.nodocuments instead)
      # Bradford has #tab_documents but without the document count on it
      app.documents_count = 0

      if documents_link = res.at('.associateddocument a')
        if documents_link.inner_text.match(/\d+/)
          app.documents_count = documents_link.inner_text.match(/\d+/)[0].to_i
          app.documents_url = base_url + documents_link[:href]
        end
      elsif documents_link = res.at('#tab_documents')
        if documents_link.inner_text.match(/\d+/)
          app.documents_count = documents_link.inner_text.match(/\d+/)[0].to_i
          app.documents_url = base_url + documents_link[:href]
        end
      end
      
      # We need to find values in the table by using the th labels.
      # The row indexes/positions change from site to site (or even app to app) so we can't rely on that.

      res.search('#simpleDetailsTable tr').each do |row|
        key = row.at('th').inner_text.strip
        value = row.at('td').inner_text.strip
        
        case key
          when 'Reference'
            app.council_reference = value
          when 'Alternative Reference'
            app.alternative_reference = value unless value.empty?
          when 'Planning Portal Reference'
            app.alternative_reference = value unless value.empty?
          when 'Application Received'
            app.date_received = Date.parse(value) if value.match(/\d/)
          when 'Application Registered'
            app.date_received = Date.parse(value) if value.match(/\d/)
          when 'Application Validated'
            app.date_validated = Date.parse(value) if value.match(/\d/)
          when 'Address'
            app.address = value unless value.empty?
          when 'Proposal'
            app.description = value unless value.empty?
          when 'Status'
            app.status = value unless value.empty?
          when 'Decision'
            app.decision = value unless value.empty?
          when 'Decision Issued Date'
            app.date_decision = Date.parse(value) if value.match(/\d/)
          when 'Appeal Status'
            app.appeal_status = value unless value.empty?
          when 'Appeal Decision'
            app.appeal_decision = value unless value.empty?
          else
            puts "Error: key '#{key}' not found"
        end # case
      end # each row
    else
      puts "Error: HTTP #{res.code}"
    end # if
  end # scrape summary tab for apps
  apps
end
scrape_northgate(params, options) click to toggle source
# File lib/uk_planning_scraper/northgate.rb, line 8
# Scrape a Northgate-based planning site: POST the search form with
# the accumulated params, follow the redirect to the results list
# (with page size forced to max so paging isn't needed), and parse
# each row into an Application.
#
# params  - hash of accumulated scrape params.
# options - hash of scrape options. NOTE(review): options[:delay]
#           appears unused in this method, unlike the Idox scraper.
#
# Returns an array of Application objects.
def scrape_northgate(params, options)
  puts "Using Northgate scraper."
  # Scheme and host only, used to build absolute URLs
  base_url = @url.match(/(https?:\/\/.+?)\//)[1]
  
  # Remove 'generalsearch.aspx' from the end and add '/Generic/' - case sensitive?
  generic_url = @url.match(/.+\//)[0] + 'Generic/'
  
  apps = []

  $stdout.sync = true # Flush output buffer after every write so log messages appear immediately.
  logger = Logger.new($stdout)
  logger.level = Logger::DEBUG

  # NOTE(review): date_regex appears unused in this method
  date_regex = /\d{2}-\d{2}-\d{4}/

  form_vars = {
    'csbtnSearch' => 'Search' # required
  }

  # Keywords
  form_vars['txtProposal'] = params[:keywords]

  # Date received from and to
  # NOTE(review): the three date-range blocks below all write the same
  # cboSelectDateValue/dateStart/dateEnd keys, so only the last range
  # supplied actually takes effect - confirm this is intended
  if params[:received_from] || params[:received_to]
    form_vars['cboSelectDateValue'] = 'DATE_RECEIVED'
    form_vars['rbGroup'] = 'rbRange'
    form_vars['dateStart'] = params[:received_from].to_s if params[:received_from] # YYYY-MM-DD
    form_vars['dateEnd'] = params[:received_to].to_s if params[:received_to] # YYYY-MM-DD
  end

  # Date validated from and to
  if params[:validated_from] || params[:validated_to]
    form_vars['cboSelectDateValue'] = 'DATE_VALID'
    form_vars['rbGroup'] = 'rbRange'
    form_vars['dateStart'] = params[:validated_from].to_s if params[:validated_from] # YYYY-MM-DD
    form_vars['dateEnd'] = params[:validated_to].to_s if params[:validated_to] # YYYY-MM-DD
  end

  # Date decided from and to
  if params[:decided_from] || params[:decided_to]
    form_vars['cboSelectDateValue'] = 'DATE_DECISION'
    form_vars['rbGroup'] = 'rbRange'
    form_vars['dateStart'] = params[:decided_from].to_s if params[:decided_from] # YYYY-MM-DD
    form_vars['dateEnd'] = params[:decided_to].to_s if params[:decided_to] # YYYY-MM-DD
  end
  
  # Status
  if params[:status]
    form_vars['cboStatusCode'] = params[:status]
  end

  # Case officer code
  # Searching by case officer uses a different search page
  if params[:case_officer_code]
    form_vars['cboCaseOfficerCode'] = params[:case_officer_code]
    @url.sub!('GeneralSearch.aspx', 'CaseOfficerWorkloadSearch.aspx')
  end

  logger.info "Form variables: #{form_vars.to_s}"

  headers = {
    'Origin' => base_url,
    'Referer' => @url,
  }

  logger.debug "HTTP request headers:"
  logger.debug(headers.to_s)

  # First GET the search page to pick up the ASP.NET state tokens
  logger.debug "GET: " + @url
  response = HTTP.headers(headers).get(@url)
  logger.debug "Response code: HTTP " + response.code.to_s

  if response.code == 200
    doc = Nokogiri::HTML(response.to_s)
    # ASP.NET requires these hidden fields to be POSTed back verbatim
    asp_vars = {
      '__VIEWSTATE' => doc.at('#__VIEWSTATE')['value'],
      '__EVENTVALIDATION' => doc.at('#__EVENTVALIDATION')['value']
     }
  else
    logger.fatal "Bad response from search page. Response code: #{response.code.to_s}."
    raise RuntimeError.new("Northgate: Bad response from search page. Response code: #{response.code.to_s}.")
  end

  # Carry the session cookies over to the POST
  cookies = {}
  response.cookies.each { |c| cookies[c.name] = c.value }

  form_vars.merge!(asp_vars)

  logger.debug "POST: " + @url
  response2 = HTTP.headers(headers).cookies(cookies).post(@url, :form => form_vars)
  logger.debug "Response code: HTTP " + response2.code.to_s

  if response2.code == 302
    # Follow the redirect manually
    # Set the page size (PS) to max so we don't have to page through search results
    # NOTE(review): URI::encode was removed in Ruby 3.0 - needs replacing.
    # Also gsub! returns nil when 'PS=10' is absent from the Location
    # header, which would make this line raise - confirm and harden.
    logger.debug "Location: #{response2.headers['Location']}"
    results_url = URI::encode(base_url + response2.headers['Location'].gsub!('PS=10', 'PS=99999'))
    logger.debug "GET: " + results_url
    response3 = HTTP.headers(headers).cookies(cookies).get(results_url)
    logger.debug "Response code: HTTP " + response3.code.to_s
    doc = Nokogiri::HTML(response3.to_s)
  else
    logger.error "Didn't get redirected from search."
    raise RuntimeError.new("Northgate: didn't get redirected from search.")
  end

  rows = doc.search("table.display_table tr")
  logger.info "Found #{rows.size - 1} applications in search results." # The first row is the header row

  # Iterate over search results
  rows.each do |row|
    if row.at("td") # skip header row which only has th's
      cells = row.search("td")

      app = Application.new
      app.scraped_at = Time.now
      app.council_reference = cells[0].inner_text.strip
      app.info_url = URI::encode(generic_url + cells[0].at('a')[:href].strip)
      app.info_url.gsub!(/%0./, '') # FIXME. Strip junk chars from URL - how can we prevent this?
      app.address = cells[1].inner_text.strip
      app.description = cells[2].inner_text.strip
      app.status = cells[3].inner_text.strip
      raw_date_validated = cells[4].inner_text.strip
      app.date_validated = Date.parse(raw_date_validated) if raw_date_validated != '--'
      app.decision = cells[5].inner_text.strip if cells[5] # Some councils don't have this column, eg Hackney

      apps << app
    end
  end
  apps
end