module Scraper

Constants

ADVISEMENT_TO_GRAD_YEAR
AGENT

Constants

COURSE_TITLE_IGNORES
GRADES
IGNORE_TITLES
SINGULAR_GRADES
SUCCESSFUL_LOGIN_PAGE_TITLE

Public Instance Methods

extract_course(id)
# File lib/lector/scraper.rb, line 96
# Scrapes a Moodle course page into a hash describing the course.
#
# The page title is split on ':'. The second segment is used as the course
# title, and the presence of a third segment (the teacher's name) marks the
# course as a class. For classes, the participant list filtered by roleid=3
# (presumably the teacher role -- confirm against the Moodle instance) is
# fetched and the first matching profile link supplies the teacher's id.
#
# Errors raised by get_profile are not rescued here and propagate.
#
# @param id [#to_s] Moodle course id
# @return [Hash] :type, :id, :title and :is_class; :teacher_id is added for
#   classes when a teacher profile link carrying an id could be found
# @raise [RuntimeError] when the title starts or ends with an entry of
#   COURSE_TITLE_IGNORES
def extract_course(id)
  page = get_profile(id, :course)

  parts = page.title.split(':')
  is_class = parts.length > 2 # Classes have a teacher, title would have ': Teacher Name'

  # Course: Theater Production: Grunner
  title = (parts.length == 1 ? parts[0] : parts[1]).strip

  if COURSE_TITLE_IGNORES.any? { |ignore| title.end_with?(ignore) || title.start_with?(ignore) }
    raise "#{id}: Ignoring 'course' #{title}"
  end

  returning = {
    type: :course,
    id: id,
    title: title,
    is_class: is_class
  }

  if is_class
    # Find teacher
    teacher_page = AGENT.get("http://moodle.regis.org/user/index.php?roleid=3&sifirst=&silast=&id=#{id}")
    teacher_link = teacher_page.search("//strong/a[contains(@href, 'moodle.regis.org')]")[0]

    # The listing may contain no matching link, or a link without '?id=';
    # in that case the course is still returned, just without :teacher_id,
    # instead of failing with NoMethodError on nil.
    href = teacher_link && teacher_link['href']
    id_part = href && href.split('?id=')[1]
    returning[:teacher_id] = id_part.split('&course=')[0].to_i if id_part
  end

  puts "#{id}: Course #{parts[1..-1].join}#{" (class)" if is_class}"

  returning
end
extract_person(id)
# File lib/lector/scraper.rb, line 42
# Scrapes a Moodle user profile into a hash describing a student or staff member.
#
# The text before the first ':' in the page title is treated as the person's
# name: the first word becomes the first name, the rest the last name. The
# "Department" field decides the type: an empty value or one starting with a
# digit means :student, anything else means :staff. A username is guessed from
# the first initial and last name (plus, for students, the value looked up in
# Lector::ADVISEMENT_TO_GRAD_YEAR by the department's first character).
#
# Students are additionally linked to Veracross data through
# find_by_username_or_email; their :department key is renamed to :advisement
# and the picture URL is replaced with the Veracross photo.
#
# Errors raised by get_profile are not rescued here and propagate.
#
# @param id [#to_s] Moodle user id
# @return [Hash] :id, :type, :first_name, :last_name, :email, :pictureurl,
#   :username, :course_ids (array of id strings) and :department (staff) or
#   :advisement, :veracross_id, :graduation_year, :address, :birthday (students)
# @raise [InvalidPageError] when no profile picture is found on the page, or
#   when a student cannot be found through find_by_username_or_email
def extract_person(id)
  page = get_profile(id, :person)

  name = page.title.split(':')[0].split(' ')
  first_name = name[0]
  last_name = name[1..-1].join(' ')

  picture_url = nil
  begin
    picture_url = page.search("a/img[@alt=\"Picture of #{first_name} #{last_name}\"]")[0]['src']
  rescue => e
    # Any failure while locating the picture is reported as an invalid page.
    puts e
    raise InvalidPageError, 'Page doesn\'t have picture.'
  end

  # DEPARTMENT (advisement for students, subject for staff)
  department = page.search("//dd[../dt = 'Department']/text()").to_s

  type = (/\A\d+\z/.match(department[0]) || department.empty? ? :student : :staff)

  # Students get a suffix looked up by the first character of their advisement.
  suffix = (type == :staff ? '' : Lector::ADVISEMENT_TO_GRAD_YEAR[department[0]])
  # Strip apostrophes, spaces and hyphens (the trailing '-' is literal for String#delete).
  username_guess = "#{first_name[0]}#{last_name}#{suffix}".downcase.delete("' -")

  returning = {
    id: id,
    type: type,
    first_name: first_name,
    last_name: last_name,
    email: username_guess + '@regis.org',
    department: department,
    pictureurl: picture_url,
    username: username_guess,
    course_ids: page.search("//dd/ul/li/a[contains(@href, 'http://moodle.regis.org/course/view.php?id=')]").map { |link| link["href"].split("id=")[1].split("&")[0] }
  }

  if type == :student
    # FIND INFO FROM VERACROSS
    info = find_by_username_or_email(username_guess)
    raise InvalidPageError, 'Student doesn\'t exist. Failed to link to Veracross data.' if info.nil?

    returning[:advisement] = returning.delete(:department)
    returning[:veracross_id] = info['person_pk']
    returning[:pictureurl] = info['photo_url']
    returning[:graduation_year] = info['graduation_year']
    # Replace every line break, not just the first, so multi-line addresses
    # do not keep stray '<br />' markup.
    returning[:address] = info['resident_address'].gsub('<br />', ' ')
    returning[:birthday] = Date.parse(info['birthday'])
  end

  puts "#{id}: #{type.capitalize} #{last_name}, #{first_name} of #{department} (#{username_guess})"

  returning
end
get_profile(id, type)
# File lib/lector/scraper.rb, line 29
# Fetches the Moodle page for a person or a course and validates its title.
#
# @param id [#to_s] Moodle id appended to the page URL
# @param type [Symbol] :person selects the user profile URL; any other value
#   selects the course view URL
# @return the page object returned by AGENT.get
# @raise [InvalidPageError] when the page has no title, when the title
#   contains (case-insensitively) an entry of IGNORE_TITLES, or when the
#   title has fewer than two ':'-separated parts
def get_profile(id, type)
  prefix =
    if type == :person
      'http://moodle.regis.org/user/profile.php?id='
    else
      'http://moodle.regis.org/course/view.php?id='
    end

  # Request page
  page = AGENT.get("#{prefix}#{id}")
  heading = page.title

  # Discard unnecessary pages
  if heading.nil? || IGNORE_TITLES.any? { |word| heading.downcase.include?(word.downcase) }
    raise InvalidPageError, "Invalid page: '#{heading}'"
  end

  unless heading.split(':').length >= 2
    raise InvalidPageError, "Name is too short: '#{heading}'"
  end

  page
end
login_to_moodle()
# File lib/lector/scraper.rb, line 16
# Logs the shared AGENT into Moodle with the configured Regis credentials.
#
# Reads :regis_username and :regis_password from @config and POSTs them to
# the Moodle login URL. Success is judged solely by the title of the
# response page.
#
# @return [true] when the response title equals SUCCESSFUL_LOGIN_PAGE_TITLE
# @raise [InvalidCredentialsError] when the response page has any other title
def login_to_moodle
  username = @config[:regis_username]
  puts "Attempting to login to Moodle as #{username}..."

  page = AGENT.post('https://moodle.regis.org/login/index.php', {
    username: username,
    password: @config[:regis_password],
  })

  raise InvalidCredentialsError, 'Invalid Regis credentials! Couldn\'t log into Moodle.' unless page.title == SUCCESSFUL_LOGIN_PAGE_TITLE

  true
end