module Textminer

Textminer::Mined

Class to give back text mining object

Textminer::Miner

Class to give back text mining object

Constants

VERSION

Public Class Methods

extract(path) click to toggle source

Thin layer around pdf-reader gem's PDF::Reader

@param path [String] Path to a PDF file downloaded via {fetch}, or obtained another way.

This method is used internally within fetch to parse PDFs.

@example

require 'textminer'
res = Textminer.search(member: 2258, filter: {has_full_text: true});
links = res.links_pdf(true);
# Get full text for an article
out = Textminer.fetch(url: links[0]);
# extract pdf to text
Textminer.extract(out.path)
# File lib/textminer.rb, line 140
# Opens the PDF at +path+ with PDF::Reader and returns the text of every
# page, in page order, joined with newline characters.
#
# @param path [String] Path to a PDF file
# @return [String] Text of all pages, separated by "\n"
def self.extract(path)
  reader = PDF::Reader.new(path)
  page_texts = reader.pages.map(&:text)
  page_texts.join("\n")
end
fetch(url) click to toggle source

Get full text

Works easily for open access papers, but not for closed-access ones. For non-OA papers, use Crossref's Text and Data Mining (TDM) service, which requires authentication and a pre-authorized IP address. Go to apps.crossref.org/clickthrough/researchers to sign up for the TDM service and get your key. The only publishers taking part at this time are Elsevier and Wiley.

@param url [String] A URL for full text

@return [Mined] An object of class Mined, with methods for retrieving the requested URL and the file path, and for parsing the plain text or XML, or extracting text from the PDF.

@example

require 'textminer'
# Set authorization
Textminer.configuration do |config|
  config.tdm_key = "<your key>"
end
# Get some elsevier works
res = Textminer.search(member: 78, filter: {has_full_text: true});
links = res.links_xml(true);
# Get full text for an article
out = Textminer.fetch(url: links[0]);
out.url
out.path
out.type
xml = out.parse()
puts xml
xml.xpath('//xocs:cover-date-text', xml.root.namespaces).text
# Get lots of articles
links = links[1..3]
out = links.collect{ |x| Textminer.fetch(url: x) }
out.collect{ |z| z.path }
out.collect{ |z| z.parse }
zz = out[0].parse
zz.xpath('//xocs:cover-date-text', zz.root.namespaces).text

## plain text
# get full text links, here doing plain text
links = res.links_plain(true);
# Get full text for an article
res = Textminer.fetch(url: links[0]);
res.url
res.parse

# With open access content - using Pensoft
res = Textminer.search(member: 2258, filter: {has_full_text: true});
links = res.links_xml(true);
# Get full text for an article
res = Textminer.fetch(url: links[0]);
res.url
res.parse

# OA content - pdfs, using pensoft again
res = Textminer.search(member: 2258, filter: {has_full_text: true});
links = res.links_pdf(true);
# Get full text for an article
res = Textminer.fetch(url: links[0]);
# url used
res.url
# document type
res.type
# document path on your machine
res.path
# get text
res.parse
# File lib/textminer.rb, line 120
# Builds a Miner for +url+ and returns whatever its +perform+ call returns.
#
# @param url [String] A URL for full text
def self.fetch(url)
  miner = Miner.new(url)
  miner.perform
end

Protected Class Methods