class DwcaHunter::ResourceWikispecies
Wikispecies source
Public Class Methods
new(opts = { download: true, unpack: true })
click to toggle source
Calls superclass method
DwcaHunter::Resource::new
# File lib/dwca_hunter/resources/wikispecies.rb, line 6
# Prepares paths, resource metadata, empty accumulators and the regular
# expressions used while scanning the dump, then passes +opts+ on to the
# superclass constructor.
#
# @param opts [Hash] options handed to the superclass; a truthy +:url+
#   replaces the default dump URL.
def initialize(opts = { download: true, unpack: true })
  tmp_root = Dir.tmpdir

  # Resource identity.
  @command = "wikispecies"
  @title = "Wikispecies"
  @uuid = "68923690-0727-473c-b7c5-2ae9e601e3fd"

  # Dump location (overridable through opts[:url]) and local storage.
  @url = opts[:url] ||
         "http://dumps.wikimedia.org/specieswiki/latest/" \
         "specieswiki-latest-pages-articles.xml.bz2"
  @wikisp_path = File.join(tmp_root, "dwca_hunter", "wikispecies")
  @download_path = File.join(@wikisp_path, "data.xml.bz2")
  @problems_file = File.open(File.join(tmp_root, "problems.txt"), "w:utf-8")

  # Accumulators filled while the dump is processed.
  @data = []
  @extensions = []
  @templates = {}
  @taxon_ids = {}
  @tree = {}
  @paths = {}

  @parser = Biodiversity::Parser
  @re = {
    page_start: /^\s*<page>\s*$/,
    page_end: %r{^\s*</page>\s*$},
    template: /Template:/i,
    template_link: /\{\{([^}]*)\}\}/,
    vernacular_names: /\{\{\s*VN\s*\|([^}]+)\}\}/i
  }

  super(opts)
end
Public Instance Methods
download()
click to toggle source
# File lib/dwca_hunter/resources/wikispecies.rb, line 33
# Fetches the dump at @url into @download_path using curl.
#
# curl is started with an argument vector instead of a shell command line,
# so characters such as "&", "?" or spaces in the URL or the target path
# are passed through verbatim and cannot be interpreted by a shell.
#
# @return [String] whatever curl printed on standard output
def download
  puts "Downloading from the source"
  IO.popen(["curl", "-L", @url, "-o", @download_path], &:read)
end
make_dwca()
click to toggle source
# File lib/dwca_hunter/resources/wikispecies.rb, line 42
# Runs the two stages of archive creation in order: first the dump is read
# into memory (enrich_data), then the archive is written (generate_dwca).
#
# @return whatever generate_dwca returns
def make_dwca
  enrich_data
  generate_dwca
end
unpack()
click to toggle source
# File lib/dwca_hunter/resources/wikispecies.rb, line 38
# Unpacks the downloaded file by delegating to unpack_bz2.
#
# @return whatever unpack_bz2 returns
def unpack
  unpack_bz2
end
Private Instance Methods
enrich_data()
click to toggle source
# File lib/dwca_hunter/resources/wikispecies.rb, line 49
# Streams data.xml from @download_dir line by line, cuts it into
# <page>...</page> fragments and hands every fragment, parsed with
# Nokogiri, to process_template or process_species.
#
# The file is opened in block form so the handle is released even when a
# page handler raises.
#
# @return [nil]
def enrich_data
  DwcaHunter.logger_write(object_id, "Extracting data from xml file...")
  # The working directory is changed for the whole process, as before.
  Dir.chdir(@download_dir)
  page_num = 0
  File.open("data.xml", "r:utf-8") do |f|
    page_on = false
    page = +""
    f.each do |l|
      if l.match?(@re[:page_start])
        page << l
        page_on = true
      elsif page_on
        page << l
        next unless l.match?(@re[:page_end])

        page_on = false
        page_xml = Nokogiri::XML.parse(page)
        if template?(page_xml)
          process_template(page_xml)
        else
          process_species(page_xml)
        end
        page_num += 1
        if (page_num % BATCH_SIZE).zero?
          DwcaHunter.logger_write(object_id, "Traversed #{page_num} pages")
        end
        # Start a fresh buffer (never reuse the one just parsed) and drop
        # the per-page memoised title and id.
        page = +""
        @page_title = nil
        @page_id = nil
      end
    end
  end
  DwcaHunter.logger_write(object_id, "Extracted total #{page_num} pages")
  nil
end
find_species_components(x)
click to toggle source
# File lib/dwca_hunter/resources/wikispecies.rb, line 178
# Splits the page text into headed sections and returns them only when the
# page has both a "{{int:name}}" and a "{{int:taxonavigation}}" section.
#
# @param x parsed page; must answer xpath("//text") with something that
#   has #text
# @return [Hash, nil] the sections, or nil when a required one is missing
def find_species_components(x)
  items = get_items(x.xpath("//text").text)
  required = ["{{int:name}}", "{{int:taxonavigation}}"]
  required.all? { |heading| items.key?(heading) } ? items : nil
end
generate_dwca()
click to toggle source
Calls superclass method
DwcaHunter::Resource#generate_dwca
# File lib/dwca_hunter/resources/wikispecies.rb, line 242
# Builds the core table (@core), the vernacular-name extension (appended to
# @extensions) and the metadata (@eml) from @data, then calls the
# superclass implementation.
#
# Side effects on @data: when the last element of a record's
# :classificationPath equals its :canonicalForm, that element is removed.
#
# @return whatever the superclass generate_dwca returns
def generate_dwca
  DwcaHunter.logger_write(object_id, "Creating DarwinCore Archive file")
  @core = [
    ["http://rs.tdwg.org/dwc/terms/taxonID",
     "http://rs.tdwg.org/dwc/terms/scientificName",
     "http://globalnames.org/terms/canonicalForm",
     "http://purl.org/dc/terms/source"]
  ]
  DwcaHunter.logger_write(object_id, "Assembling Core Data")
  @data.each.with_index(1) do |d, count|
    if (count % BATCH_SIZE).zero?
      DwcaHunter.logger_write(object_id,
                              "Traversing #{count} core data record")
    end
    path = d[:classificationPath]
    # Use the id of the template named by the last path element; fall back
    # to the page's own id when the path is empty or the template is unknown.
    template = path.empty? ? nil : @templates[path.last]
    taxon_id = template ? template[:id] : d[:taxonId]
    @taxon_ids[d[:taxonId]] = taxon_id

    url = "http://species.wikimedia.org/wiki/" \
          "#{CGI.escape(d[:canonicalForm].tr(' ', '_'))}"
    path.pop if path[-1] == d[:canonicalForm]
    # A trailing parenthesised part of the title is not part of the name.
    canonical_form = d[:canonicalForm].gsub(/\(.*\)\s*$/, "").strip
    scientific_name = if d[:scientificName] == d[:canonicalForm]
                        canonical_form
                      else
                        d[:scientificName]
                      end
    @core << [taxon_id, scientific_name, canonical_form, url]
  end

  vernacular_rows = [[
    "http://rs.tdwg.org/dwc/terms/TaxonID",
    "http://rs.tdwg.org/dwc/terms/vernacularName",
    "http://purl.org/dc/terms/language"
  ]]
  @extensions << { data: vernacular_rows, file_name: "vernacular_names.txt" }
  DwcaHunter.logger_write(
    object_id,
    "Creating vernacular name extension for DarwinCore Archive file"
  )
  @data.each.with_index(1) do |d, count|
    if (count % BATCH_SIZE).zero?
      DwcaHunter.logger_write(object_id,
                              "Traversing #{count} extension data record")
    end
    taxon_id = @taxon_ids[d[:taxonId]]
    next unless taxon_id

    d[:vernacularNames].each do |vn|
      vernacular_rows << [taxon_id, vn[:name], vn[:language]]
    end
  end

  @eml = {
    id: @uuid,
    title: @title,
    license: "http://creativecommons.org/licenses/by-sa/3.0/",
    authors: [
      { first_name: "Stephen",
        last_name: "Thorpe",
        email: "stephen_thorpe@yahoo.co.nz",
        url: "http://species.wikimedia.org/wiki/Main_Page" }
    ],
    abstract: "The free species directory that anyone can edit.",
    metadata_providers: [
      { first_name: "Dmitry",
        last_name: "Mozzherin",
        email: "dmozzherin@mbl.edu" }
    ],
    url: "http://species.wikimedia.org/wiki/Main_Page"
  }
  super
end
get_full_scientific_name(items)
click to toggle source
# File lib/dwca_hunter/resources/wikispecies.rb, line 125
# Replaces :scientificName of the most recent @data record with the cleaned
# first line of the "{{int:name}}" section. When that section is missing or
# empty, the record's canonical form is written to @problems_file instead.
# When parse_name returns "", the record is left unchanged.
#
# @param items [Hash] sections of the page keyed by heading
def get_full_scientific_name(items)
  candidates = items["{{int:name}}"]
  record = @data[-1]
  unless candidates && !candidates.empty?
    @problems_file.write(format("%s\n", record[:canonicalForm]))
    return
  end

  cleaned = parse_name(candidates[0], record)
  record[:scientificName] = cleaned unless cleaned == ""
end
get_items(txt)
click to toggle source
# File lib/dwca_hunter/resources/wikispecies.rb, line 187
# Groups the lines of a wiki text by the heading they follow.
#
# A line containing "=...=" starts a new section; its key is the text
# between the "=" runs, stripped and downcased. Non-empty lines are added
# to the current section; lines before the first heading are ignored. A
# heading that occurs twice starts its section over.
#
# @param txt [String] raw wiki text
# @return [Hash{String => Array<String>}] section lines keyed by heading
def get_items(txt)
  current_item = nil
  txt.split("\n").each_with_object({}) do |l, items|
    heading = l[/=+([^=]+)=+/, 1]
    if heading
      current_item = heading.strip.downcase
      items[current_item] = []
    elsif current_item && !l.empty?
      items[current_item] << l
    end
  end
end
get_vernacular_names(items)
click to toggle source
# File lib/dwca_hunter/resources/wikispecies.rb, line 140
# Reads the first {{VN|...}} template of the "{{int:vernacular names}}"
# section and stores its language=name pairs as :vernacularNames on the
# most recent @data record.
#
# Entries are kept only when both parts are present, the language code is
# shorter than four characters and the name has a valid encoding. Each
# entry is split on its first "=" only, so a name that itself contains "="
# is kept whole.
#
# @param items [Hash] sections of the page keyed by heading
def get_vernacular_names(items)
  vern = items["{{int:vernacular names}}"]
  return unless vern.is_a?(Array) && !vern.empty?

  vn = vern.join.match(@re[:vernacular_names])
  return unless vn

  vnames = vn[1].strip.split("|").each_with_object([]) do |item, acc|
    language, name = item.split("=", 2).map(&:strip)
    next unless language && name && language.size < 4 && name.valid_encoding?

    acc << { name: name, language: language }
  end
  @data[-1][:vernacularNames] = vnames
end
init_classification_path(items)
click to toggle source
# File lib/dwca_hunter/resources/wikispecies.rb, line 163
# Appends the first suitable template name found in the "taxonavigation"
# section to :classificationPath of the most recent @data record.
#
# [[...]] links are removed from each line before the search (non-greedy,
# so a template between two links survives). A template whose content
# contains "|" is skipped. The lines held in +items+ are not modified.
#
# NOTE(review): find_species_components requires the key
# "{{int:taxonavigation}}" while this method reads "taxonavigation" --
# confirm which heading form is intended.
#
# @param items [Hash] sections of the page keyed by heading
def init_classification_path(items)
  items["taxonavigation"]&.each do |line|
    link_free = line.gsub(/\[\[.*?\]\]/, "") # ignore non-template links
    found = link_free.match(@re[:template_link])
    next unless found

    template_link = found[1].strip.gsub(/Template:/, "").tr("_", " ")
    next if template_link.include?("|")

    @data[-1][:classificationPath] << template_link
    break
  end
end
page_id(x)
click to toggle source
# File lib/dwca_hunter/resources/wikispecies.rb, line 207
# Returns the text of the first node matching "//id", remembering it in
# @page_id so later calls return the stored value until it is cleared.
#
# @param x parsed page; must answer xpath
# @return [String] the page id
def page_id(x)
  return @page_id if @page_id

  first_id_node = x.xpath("//id").first
  @page_id = first_id_node.text
end
page_title(x)
click to toggle source
# File lib/dwca_hunter/resources/wikispecies.rb, line 203
# Returns the text of the first node matching "//title", remembering it in
# @page_title so later calls return the stored value until it is cleared.
#
# @param x parsed page; must answer xpath
# @return [String] the page title
def page_title(x)
  return @page_title if @page_title

  first_title_node = x.xpath("//title").first
  @page_title = first_title_node.text
end
parse_name(name_string, taxa)
click to toggle source
# File lib/dwca_hunter/resources/wikispecies.rb, line 215
# Strips wiki markup from a name line and returns the cleaned name, or ""
# when @parser rates it with a quality other than "1" or "2".
#
# The argument is never modified, so frozen strings are accepted.
#
# @param name_string [String] raw line from the name section
# @param taxa [Hash] record whose :canonicalForm replaces "BASEPAGENAME"
# @return [String] cleaned name, or "" when the parser rejects it
def parse_name(name_string, taxa)
  # Applied in this order; the dagger rule runs twice because removing
  # <nowiki>/<br> markup can expose a leading dagger.
  cleanups = [
    [/^\*\s*/, ""],
    [/\[\[([^\]]+\|)?([^\]]*)\]\]/, '\2'],
    [/\{\{([^}]+\|)?([^}]*)\}\}/, '\2'],
    [/'{2,}/, " "],
    [/"{2,}/, " "],
    [/:\s*\d.*$/, ""],
    [/,\s*\[RSD\]/i, ""],
    [/^\s*†\s*/, ""],
    [/(:\s*)?\[http:[^\]]+\]/, ""],
    [/<nowiki>.*$/, ""],
    [%r{<br\s*/?\s*>}, ""],
    [/^\s*†\s*/, ""],
    # NOTE(review): the displayed source replaced a space with a space here,
    # presumably a non-breaking space lost in rendering -- confirm. Both the
    # character and the literal entity are normalised, since \s below does
    # not match U+00A0.
    [/&nbsp;|\u00A0/, " "],
    [/\s+/, " "]
  ]
  res = name_string.gsub("BASEPAGENAME", taxa[:canonicalForm]).strip
  res = cleanups.reduce(res) do |str, (pattern, replacement)|
    str.gsub(pattern, replacement)
  end.strip
  parsed = @parser.parse(res, simple: true)
  %w[1 2].include?(parsed[:quality]) ? res : ""
end
process_species(x)
click to toggle source
# File lib/dwca_hunter/resources/wikispecies.rb, line 108
# Adds a record to @data for a page that has the sections required by
# find_species_components, then fills in its scientific name and
# vernacular names. Pages whose title contains "Wikispecies" (any case)
# are ignored.
#
# @param x parsed page
def process_species(x)
  title = page_title(x)
  return if title.match?(/Wikispecies/i)

  items = find_species_components(x)
  return unless items

  @data << {
    taxonId: page_id(x),
    canonicalForm: title,
    scientificName: title,
    classificationPath: [],
    vernacularNames: []
  }
  get_full_scientific_name(items)
  get_vernacular_names(items)
end
process_template(x)
click to toggle source
# File lib/dwca_hunter/resources/wikispecies.rb, line 87
# Registers a template page in @templates under its name (title without
# "Template:", underscores turned into spaces), together with the page id
# and the parent name taken from the first {{...}} link in the page text.
#
# Pages whose first link contains "#if" are not registered. The parent is
# the second "|"-separated part when the first part matches "Taxonav",
# otherwise the first part; it is nil when the page has no link or the
# link is empty. The memoised title string is not modified.
#
# @param x parsed page
def process_template(x)
  name = page_title(x).gsub(@re[:template], "").strip.tr("_", " ")
  text = x.xpath("//text").text.strip
  link = text.match(@re[:template_link])
  parent_name = nil
  if link
    return if link[1].match?(/\#if/)

    list = link[1].split("|")
    # An empty link yields an empty list, so list[0] is nil here.
    parent_name = if list.size > 1 && list[0].match?(/Taxonav/i)
                    list[1]
                  else
                    list[0]
                  end
    parent_name = parent_name&.tr("_", " ")
  end
  @templates[name] = { parentName: parent_name, id: page_id(x) }
end
template?(page_xml)
click to toggle source
# File lib/dwca_hunter/resources/wikispecies.rb, line 211
# Tells whether the page title matches the template pattern held in
# @re[:template].
#
# @param page_xml parsed page
# @return [Boolean]
def template?(page_xml)
  title = page_title(page_xml)
  title.match(@re[:template]) ? true : false
end