class RayyanScrapers::EntrezScraper
Constants
- DEFAULT_EXTRACTION_FIELDS
Attributes
query[R]
Public Instance Methods
create_search_url(page)
# File lib/rayyan-scrapers/entrez_scraper.rb, line 50
def create_search_url(page)
  retmax = self.class.results_per_page
  retstart = (page - 1) * retmax
  "#{@search_url}&term=#{@query}&retstart=#{retstart}&retmax=#{retmax}&usehistory=y"
end
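For illustration only, assuming @search_url points at the standard E-utilities esearch endpoint for PubMed and results_per_page is 100 (both are assumptions, not values documented on this page), the second page of a "cancer" search would be requested with a URL roughly like this:

  scraper.create_search_url(2)
  # => "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=(cancer)&retstart=100&retmax=100&usehistory=y"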
escape_keyword(keyword)
# File lib/rayyan-scrapers/entrez_scraper.rb, line 46
def escape_keyword(keyword)
  "(" + URI.escape(keyword) + ")"
end
extract_abstracts(xml, mArticle)
# File lib/rayyan-scrapers/entrez_scraper.rb, line 269
def extract_abstracts(xml, mArticle)
  mArticle.abstracts = (xml/'./Abstract/AbstractText').to_enum.map do |abstract|
    {
      label: abstract['Label'],
      category: abstract['NlmCategory'],
      content: abstract.text
    }
  end
end
extract_affiliation(xml, mArticle)
# File lib/rayyan-scrapers/entrez_scraper.rb, line 251
def extract_affiliation(xml, mArticle)
  mArticle.affiliation = extract_xpath_text xml, './Affiliation'
end
extract_article_idtypes(xml, mArticle)
# File lib/rayyan-scrapers/entrez_scraper.rb, line 288
def extract_article_idtypes(xml, mArticle)
  extract_idtypes xml, mArticle, "./#{@xml_element_data}/ArticleIdList/ArticleId"
end
extract_article_title(xml, mArticle)
# File lib/rayyan-scrapers/entrez_scraper.rb, line 239
def extract_article_title(xml, mArticle)
  mArticle.title = extract_xpath_text xml, './ArticleTitle'
end
extract_book_idtypes(xml, mArticle)
# File lib/rayyan-scrapers/entrez_scraper.rb, line 292
def extract_book_idtypes(xml, mArticle)
  extract_idtypes xml, mArticle, "./#{@xml_element_bookdata}/ArticleIdList/ArticleId" \
    " | ./BookDocument/ArticleIdList/ArticleId"
end
extract_book_pubtypes(xml, mArticle)
# File lib/rayyan-scrapers/entrez_scraper.rb, line 311
def extract_book_pubtypes(xml, mArticle)
  mArticle.publication_types = ["Book"]
end
extract_book_title(xml, mArticle)
# File lib/rayyan-scrapers/entrez_scraper.rb, line 243
def extract_book_title(xml, mArticle)
  mArticle.title = extract_xpath_text xml, './BookTitle'
end
extract_collection(xml, mArticle)
# File lib/rayyan-scrapers/entrez_scraper.rb, line 264
def extract_collection(xml, mArticle)
  mArticle.collection = extract_xpath_text xml, './CollectionTitle'
  mArticle.collection_code = extract_xpath_text xml, './CollectionTitle/@book'
end
extract_copyright(xml, mArticle)
# File lib/rayyan-scrapers/entrez_scraper.rb, line 247
def extract_copyright(xml, mArticle)
  mArticle.copyright = extract_xpath_text xml, './Abstract/CopyrightInformation'
end
extract_date(xml, xpath, mArticle)
# File lib/rayyan-scrapers/entrez_scraper.rb, line 356
def extract_date(xml, xpath, mArticle)
  year = extract_xpath_text xml, "#{xpath}/Year"
  month = extract_xpath_text xml, "#{xpath}/Month"
  day = extract_xpath_text xml, "#{xpath}/Day"
  mArticle.date_array = [year, month, day]
end
extract_idtypes(xml, mArticle, xpath)
# File lib/rayyan-scrapers/entrez_scraper.rb, line 297
def extract_idtypes(xml, mArticle, xpath)
  mArticle.article_ids = (xml/xpath).to_enum.map do |id|
    idtype = id['IdType']
    idtype = @db_idtype if idtype == @xml_idtype
    value = id.text
    {idtype: idtype, value: value}
  end
end
extract_journal_info(xml, mArticle)
# File lib/rayyan-scrapers/entrez_scraper.rb, line 336
def extract_journal_info(xml, mArticle)
  journal = xml.at './Journal'
  mArticle.journal_title = extract_xpath_text journal, './Title'
  mArticle.journal_issn = extract_xpath_text journal, './ISSN'
  mArticle.journal_abbreviation = extract_xpath_text journal, './ISOAbbreviation'
  mArticle.jvolume = extract_xpath_text(journal, './JournalIssue/Volume').to_i
  mArticle.jissue = extract_xpath_text(journal, './JournalIssue/Issue').to_i
  mArticle.pagination = extract_xpath_text xml, './Pagination/MedlinePgn'
  jdate = extract_date journal, './JournalIssue/PubDate', mArticle
  if jdate.compact.empty?
    jdate = extract_xpath_text journal, './JournalIssue/PubDate/MedlineDate'
    year = jdate.split.first
    month = jdate.split.last.split('-').first
    mArticle.date_array = [year, month]
  end
end
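Note the fallback: when PubDate carries no Year/Month/Day children, the free-form MedlineDate string is split instead. A hypothetical value illustrates the outcome:

  # <PubDate><MedlineDate>2000 Nov-Dec</MedlineDate></PubDate>
  # jdate.split.first                  # => "2000" (year)
  # jdate.split.last.split('-').first  # => "Nov"  (month)
  # mArticle.date_array                # => ["2000", "Nov"]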
extract_language(xml, mArticle)
# File lib/rayyan-scrapers/entrez_scraper.rb, line 255
def extract_language(xml, mArticle)
  mArticle.language = extract_xpath_text xml, './Language'
end
extract_mesh(xml, mArticle)
# File lib/rayyan-scrapers/entrez_scraper.rb, line 315
def extract_mesh(xml, mArticle)
  mArticle.keywords = (xml/"./#{@xml_element_citation}/MeshHeadingList//DescriptorName" \
    " | ./#{@xml_element_citation}/KeywordList/Keyword").to_enum.map(&:text)
end
extract_publisher(xml, mArticle)
# File lib/rayyan-scrapers/entrez_scraper.rb, line 259
def extract_publisher(xml, mArticle)
  mArticle.publisher_name = extract_xpath_text xml, './Publisher/PublisherName'
  mArticle.publisher_location = extract_xpath_text xml, './Publisher/PublisherLocation'
end
extract_pubtypes(xml, mArticle)
# File lib/rayyan-scrapers/entrez_scraper.rb, line 307
def extract_pubtypes(xml, mArticle)
  mArticle.publication_types = (xml/'./PublicationTypeList/PublicationType').to_enum.map(&:text)
end
extract_sections(xml, mArticle)
# File lib/rayyan-scrapers/entrez_scraper.rb, line 320
def extract_sections(xml, mArticle)
  mArticle.sections = (xml/'./Sections/Section').to_enum.map do |section|
    sloc = section.at './LocationLabel'
    sloc = "#{sloc['Type']}:#{sloc.text}" unless sloc.nil?
    stitle = section.at './SectionTitle'
    spart = stitle['part']
    stitle = stitle.text
    {
      code: spart,
      location: sloc,
      title: stitle
    }
  end
end
extract_xpath_text(xml, xpath)
# File lib/rayyan-scrapers/entrez_scraper.rb, line 234
def extract_xpath_text(xml, xpath)
  text = (xml/xpath).text
  text.present? ? text : nil
end
fake_process_detail_page(pmid)
# File lib/rayyan-scrapers/entrez_scraper.rb, line 105
def fake_process_detail_page(pmid)
  mArticle = RayyanFormats::Target.new
  mArticle.sid = pmid
  mArticle.url = "#{@detail_friendly_url}/#{pmid}"
  mArticle
end
fetch_and_parse_article_list(list, extraction_fields = nil) { |article| ... }
Not covered by tests.
# File lib/rayyan-scrapers/entrez_scraper.rb, line 182
def fetch_and_parse_article_list(list, extraction_fields = nil)
  # list should be an array of objects of format {pmid: pmid, article: RayyanFormats::Target}
  @hercules_articles.fight(list) do |article|
    fetch_and_parse_detail_page(article.sid, article, extraction_fields) do |article|
      yield article if block_given?
    end
  end
end
fetch_and_parse_detail_page(pmid, mArticle, extraction_fields = nil) { |nil| ... }
# File lib/rayyan-scrapers/entrez_scraper.rb, line 112
def fetch_and_parse_detail_page(pmid, mArticle, extraction_fields = nil)
  @logger.debug "Requesting detail page as #{pmid}"
  link = "#{@detail_url}&id=#{pmid}"
  @hercules_articles.strike(link, "entrez-#{pmid}", true) do |request, response|
    if response.class == Exception
      yield nil if block_given?
    else
      @logger.debug "Processing detail page as #{pmid}"
      xml = Nokogiri::XML.parse(response, link)
      root = xml.at "/#{@xml_element_root}/#{@xml_element_root_article}"
      unless root.nil?
        process_article_detail_page(root, mArticle, extraction_fields)
        yield mArticle if block_given?
      else
        root = xml.at "/#{@xml_element_root}/#{@xml_element_root_book}"
        unless root.nil?
          process_book_detail_page(root, mArticle, extraction_fields)
          yield mArticle if block_given?
        else
          @logger.warn "Unknown XML format for PMID #{pmid} with url #{link}"
          yield nil if block_given?
        end # unless book
      end # unless article
    end # if exception
  end # strike
end
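A minimal usage sketch, assuming an already constructed scraper instance; the PMID is the one used in the example URL of process_article_detail_page below. The block receives the populated target, or nil when the request failed or the XML root element was not recognized:

  target = RayyanFormats::Target.new
  scraper.fetch_and_parse_detail_page("23185113", target) do |article|
    if article.nil?
      # request failed or unknown XML format
    else
      puts article.title
    end
  end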
fetch_and_parse_pmid_list(list, extraction_fields = nil) { |article| ... }
Not covered by tests.
# File lib/rayyan-scrapers/entrez_scraper.rb, line 192
def fetch_and_parse_pmid_list(list, extraction_fields = nil)
  @hercules_articles.fight(list) do |pmid|
    process_detail_page(pmid, extraction_fields) do |article|
      yield article if block_given?
    end
  end
end
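A sketch of feeding a plain PMID list through the concurrent crawler; the PMIDs are the ones used as examples elsewhere on this page, and the block may receive nil for pages that could not be parsed:

  scraper.fetch_and_parse_pmid_list(["23185113", "22787640"]) do |article|
    puts article.title unless article.nil?
  end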
get_next_page_link(page, page_id)
# File lib/rayyan-scrapers/entrez_scraper.rb, line 74
def get_next_page_link(page, page_id)
  begin
    create_search_url page_id
  rescue
    nil
  end
end
get_start_page()
# File lib/rayyan-scrapers/entrez_scraper.rb, line 56
def get_start_page
  url = create_search_url(1)
  page = Typhoeus::Request.get(url, @headers)
  Nokogiri::XML.parse(page.body, URI.escape(url))
end
parse_search_results(string, extraction_fields = nil) { |mArticle, total| ... }
Parses user-uploaded PubMed XML.
# File lib/rayyan-scrapers/entrez_scraper.rb, line 201
def parse_search_results(string, extraction_fields = nil)
  xml = Nokogiri::XML.parse(string, "file:///rawfile.xml")
  items = xml/"/#{@xml_element_root}/*"
  total = items.length
  @logger.debug("Found #{total} articles in input pubmed file")
  items.each do |item|
    begin
      mArticle = RayyanFormats::Target.new
      failed = false
      case item.node_name
      when @xml_element_root_article
        process_article_detail_page(item, mArticle, extraction_fields)
      when @xml_element_root_book
        process_book_detail_page(item, mArticle, extraction_fields)
      else
        @logger.warn "Unknown XML format for search result of type #{item.node_name}"
        failed = true
      end
      unless failed
        pmid = ScraperBase.node_text item, './/PMID'
        mArticle.sid = pmid
        mArticle.url = "#{@detail_friendly_url}#{pmid}"
        yield mArticle, total if block_given?
      end # unless failed
    rescue => exception
      @logger.error "Error processing item in search result of type #{item.node_name} [#{exception}] " +
        "caused by #{exception.backtrace.first}"
    end # process item rescue
  end # items.each
  total
end
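A usage sketch for uploaded files; the file name is hypothetical, but any PubMed efetch XML export with the expected root element is handled the same way. The block is invoked once per recognized article or book, and the method returns the total item count:

  total = scraper.parse_search_results(File.read("pubmed_export.xml")) do |article, total|
    puts "#{article.sid}: #{article.title}"
  end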
process_article_detail_page(xml, mArticle, extraction_fields = nil)
# File lib/rayyan-scrapers/entrez_scraper.rb, line 140
def process_article_detail_page(xml, mArticle, extraction_fields = nil)
  # Example: http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml&id=23185113
  extraction_fields = DEFAULT_EXTRACTION_FIELDS if extraction_fields.nil?
  article_xml = xml/"./#{@xml_element_citation}/Article"
  extract_article_title article_xml, mArticle if extraction_fields[:title]
  extract_copyright article_xml, mArticle if extraction_fields[:copyright]
  extract_affiliation article_xml, mArticle if extraction_fields[:affiliation]
  extract_language article_xml, mArticle if extraction_fields[:language]
  extract_pubtypes article_xml, mArticle if extraction_fields[:pubtypes]
  extract_abstracts article_xml, mArticle if extraction_fields[:abstracts]
  extract_authors article_xml, mArticle if extraction_fields[:authors]
  extract_journal_info article_xml, mArticle if extraction_fields[:journal]
  extract_article_idtypes xml, mArticle if extraction_fields[:idtypes]
  extract_mesh xml, mArticle if extraction_fields[:keyphrases]
  # TODO: full text (link out) either from same document (id?) or from
  # another ELink request: http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ELink
end
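extraction_fields is consulted as a hash of per-field flags, so passing a subset restricts what is copied onto the target. A sketch using keys taken from the checks above (the exact contents of DEFAULT_EXTRACTION_FIELDS are not shown on this page; root is assumed to be the article node of a parsed efetch response):

  fields = { title: true, abstracts: true, journal: true, idtypes: true }
  scraper.process_article_detail_page(root, target, fields)
  # authors, affiliation, MeSH keyphrases, etc. are skipped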
process_book_detail_page(xml, mArticle, extraction_fields = nil)
# File lib/rayyan-scrapers/entrez_scraper.rb, line 161
def process_book_detail_page(xml, mArticle, extraction_fields = nil)
  # Example: http://www.ncbi.nlm.nih.gov/pubmed?term=22787640
  extraction_fields = DEFAULT_EXTRACTION_FIELDS if extraction_fields.nil?
  bookdoc = xml/'./BookDocument'
  book = bookdoc/'./Book'
  extract_book_title book, mArticle if extraction_fields[:title]
  extract_authors book, mArticle if extraction_fields[:authors]
  extract_publisher book, mArticle if extraction_fields[:publisher]
  extract_collection book, mArticle if extraction_fields[:collection]
  extract_date book, './PubDate', mArticle if extraction_fields[:dates]
  extract_book_pubtypes book, mArticle if extraction_fields[:pubtypes]
  extract_copyright bookdoc, mArticle if extraction_fields[:copyright]
  extract_language bookdoc, mArticle if extraction_fields[:language]
  extract_abstracts bookdoc, mArticle if extraction_fields[:abstracts]
  extract_sections bookdoc, mArticle if extraction_fields[:sections]
  extract_book_idtypes xml, mArticle if extraction_fields[:idtypes]
end
process_detail_page(pmid, extraction_fields = nil, &block)
# File lib/rayyan-scrapers/entrez_scraper.rb, line 98
def process_detail_page(pmid, extraction_fields = nil, &block)
  mArticle = RayyanFormats::Target.new
  mArticle.sid = pmid
  mArticle.url = "#{@detail_friendly_url}#{pmid}"
  fetch_and_parse_detail_page(pmid, mArticle, extraction_fields, &block)
end
process_list_page(page, &block)
# File lib/rayyan-scrapers/entrez_scraper.rb, line 82
def process_list_page(page, &block)
  page = Nokogiri::XML(page.body) unless page.is_a?(Nokogiri::XML::Document)
  @logger.info("Processing list page")
  items = page/'/eSearchResult/IdList/Id'
  @logger.info "Found #{items.length} items in page"
  @hercules_articles.fight(items) do |id|
    pmid = id.text
    @logger.info "Got result with id #{pmid}"
    # get detailed info
    process_detail_page(pmid, &block)
  end # end fight
  items.length
end
search(query, &block)
# File lib/rayyan-scrapers/entrez_scraper.rb, line 22
def search(query, &block)
  @query = if query.instance_of?(String)
    escape_keyword query
  elsif query.instance_of?(Array)
    query.map do |topic_query|
      if topic_query.instance_of?(String)
        escape_keyword topic_query
      elsif topic_query.instance_of?(Array)
        "(" + topic_query.map do |keyword|
          escape_keyword keyword
        end.join("+OR+") + ")"
      else
        raise 'Array elements must be either Strings or Arrays'
      end
    end.join("+AND+")
  else
    raise 'query must be either String or Array'
  end
  @logger.debug "Entrez scraper initialized with query #{@query}"
  scrape &block
end
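A usage sketch; how the scraper instance is constructed is not documented on this page, but the query shapes follow directly from the code above (top-level topics are joined with +AND+, keywords inside a nested array with +OR+):

  # single keyword: query becomes (malaria)
  scraper.search("malaria") { |article| puts article.title }

  # nested topics: query becomes (malaria)+AND+((children)+OR+(infants))
  scraper.search(["malaria", ["children", "infants"]]) { |article| puts article.title }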
total_pages(page)
# File lib/rayyan-scrapers/entrez_scraper.rb, line 62
def total_pages(page)
  begin
    results_text = ScraperBase.node_text(page, '/eSearchResult/Count/text()')
    n = results_text.to_i
    raise 'Zero total' if n == 0
    @logger.info("Found total of #{n} results")
    n
  rescue
    'Unknown'
  end
end
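A sketch of how the paging helpers fit together; the actual loop lives in the inherited scrape method, whose implementation is not shown on this page:

  start_page = scraper.get_start_page         # esearch results for page 1
  count = scraper.total_pages(start_page)     # /eSearchResult/Count, or 'Unknown' on failure
  scraper.process_list_page(start_page) do |article|
    puts article.title if article
  end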