class AnyStyle::Finder
Public Class Methods
new(options = {})
click to toggle source
Calls superclass method
# File lib/anystyle/finder.rb, line 17
#
# Builds a finder. The options are first handed to the superclass
# initializer; afterwards the list of feature extractors is stored in
# @features.
#
# options - Hash of settings. The only key read in this method is
#           :dictionary, which is passed to Feature::Words. When that
#           entry is nil or false, Dictionary.instance is used instead.
def initialize(options = {})
  super(options)

  extractors = [
    Feature::Line.new,
    Feature::Category.new(strip: true),
    Feature::Words.new(dictionary: options[:dictionary] || Dictionary.instance),
    Feature::Indent.new,
    Feature::Ref.new,
    # Two position features: one keyed on page/ln, one on pages/pn.
    Feature::Position.new(seq: :page, idx: :ln),
    Feature::Position.new(seq: :pages, idx: :pn)
  ]

  @features = extractors
end
Public Instance Methods
expand(dataset)
click to toggle source
# File lib/anystyle/finder.rb, line 31
#
# Fills in the observations of every line of every document in +dataset+.
#
# For each line, every entry of +features+ is asked to +observe+ the
# line's value. Alongside the value it receives keyword context: the page,
# the document's pages, the document itself (as +seq+), the +pn+ and +ln+
# values yielded by the document (presumably page and line numbers --
# confirm against the document class), the feature's index +fn+ and the
# running index +idx+ of the line within the document. The per-feature
# results are flattened and assigned through +line.observations=+.
#
# Returns the result of +dataset.each+.
def expand(dataset)
  dataset.each do |document|
    document.each.with_index do |(line, ln, page, pn), idx|
      observed = features.map.with_index { |feature, fn|
        feature.observe(
          line.value,
          page: page,
          pages: document.pages,
          seq: document,
          pn: pn,
          ln: ln,
          fn: fn,
          idx: idx
        )
      }

      line.observations = observed.flatten
    end
  end
end
find(input, format: options[:format], **opts)
click to toggle source
# File lib/anystyle/finder.rb, line 48
#
# Labels +input+ and returns the result in the requested output format.
#
# input  - handed to #label unchanged.
# format - String or Symbol naming the output format; defaults to
#          options[:format]. Recognised values:
#            references, ref - the result of #format_references
#            hash            - the result of #format_hash
#            wapiti          - the labelled dataset exactly as #label
#                              returns it
# opts   - remaining keyword options, forwarded to #label and to the
#          formatter.
#
# Raises ArgumentError when +format+ is not one of the values above.
def find(input, format: options[:format], **opts)
  # Compare by string so that a nil (or otherwise non-symbolizable) format
  # reaches the ArgumentError branch instead of failing on nil.to_sym, and
  # so that caller-supplied names are not interned as symbols.
  case format.to_s
  when 'references', 'ref'
    format_references(label(input, **opts), **opts)
  when 'hash'
    format_hash(label(input, **opts), **opts)
  when 'wapiti'
    label(input, **opts)
  else
    # Labelling is only performed inside the branches above, so an unknown
    # format fails before any work is done.
    raise ArgumentError, "unknown format '#{format}'"
  end
end
format_hash(dataset, **opts)
click to toggle source
# File lib/anystyle/finder.rb, line 61
#
# Converts each document of +dataset+ by calling +to_h+ on it.
#
# dataset - collection of documents; must respond to +map+.
# opts    - keyword options forwarded to every +to_h+ call.
#
# Returns the collection produced by +dataset.map+.
def format_hash(dataset, **opts)
  dataset.map do |document|
    document.to_h(**opts)
  end
end
format_references(dataset, **opts)
click to toggle source
# File lib/anystyle/finder.rb, line 65
#
# Collects the result of calling +references+ on each document of
# +dataset+.
#
# dataset - collection of documents; must respond to +map+.
# opts    - keyword options forwarded to every +references+ call.
#
# Returns the collection produced by +dataset.map+.
def format_references(dataset, **opts)
  dataset.map do |document|
    document.references(**opts)
  end
end
label(input, layout: true, crop: false, **opts)
click to toggle source
# File lib/anystyle/finder.rb, line 69
#
# Prepares +input+, runs it through the model and returns a new
# Wapiti::Dataset in which every prepared document has been labelled with
# the model output found at the same index.
#
# input  - handed to #prepare.
# layout - forwarded to #prepare (default: true).
# crop   - forwarded to #prepare (default: false).
# opts   - remaining keyword options, forwarded to both #prepare and
#          +model.label+.
#
# NOTE(review): +layout+ defaults to true here, whereas #prepare on its
# own defaults to options[:layout] -- confirm this difference is intended.
def label(input, layout: true, crop: false, **opts)
  prepared = prepare(input, layout: layout, crop: crop, **opts)
  tagged = model.label(prepared, **opts)

  # Pair each document with its model output by position.
  labelled = prepared.map.with_index { |document, position|
    document.label(tagged[position])
  }

  Wapiti::Dataset.new(labelled)
end
prepare(input, layout: options[:layout], crop: false, pdftotext: options[:pdftotext], pdfinfo: options[:pdfinfo], **opts)
click to toggle source
Calls superclass method
# File lib/anystyle/finder.rb, line 77
#
# Normalises +input+ and then delegates to the superclass implementation.
#
# input     - String: opened with Document.open.
#             Array: each element is opened with Document.open and the
#             results are wrapped in a Wapiti::Dataset.
#             Anything else: passed to the superclass untouched.
# layout    - forwarded to Document.open (default: options[:layout]).
# crop      - forwarded to Document.open (default: false).
# pdftotext - forwarded to Document.open (default: options[:pdftotext]).
# pdfinfo   - forwarded to Document.open (default: options[:pdfinfo]).
# opts      - remaining keyword options; forwarded to Document.open and
#             to the superclass call.
#
# Returns the result of the superclass #prepare.
def prepare(input,
            layout: options[:layout],
            crop: false,
            pdftotext: options[:pdftotext],
            pdfinfo: options[:pdfinfo],
            **opts)
  doc_opts = { layout: layout, crop: crop, pdftotext: pdftotext, pdfinfo: pdfinfo, **opts }

  source =
    case input
    when String
      Document.open(input, **doc_opts)
    when Array
      Wapiti::Dataset.new(input.map { |file| Document.open(file, **doc_opts) })
    else
      input
    end

  # Only +opts+ goes to the superclass; the document-specific keywords
  # were consumed by Document.open above.
  super(source, **opts)
end
save_each(dataset, dir: '.', tagged: false, **opts)
click to toggle source
# File lib/anystyle/finder.rb, line 94
#
# Writes every document of +dataset+ to its own file inside +dir+.
#
# The file name is the document's path with directory and extension
# removed, or the document's index within the dataset when the document
# has no path. The extension is "ttx" when +tagged+ is true and "txt"
# otherwise. The file content is +doc.to_s(tagged: tagged, **opts)+.
#
# dataset - collection of documents responding to +path+ and +to_s+.
# dir     - target directory (default: '.').
# tagged  - selects the extension and is forwarded to +to_s+.
# opts    - remaining keyword options, forwarded to +to_s+.
def save_each(dataset, dir: '.', tagged: false, **opts)
  extension = tagged ? 'ttx' : 'txt'

  dataset.each.with_index do |doc, idx|
    stem =
      if doc.path.nil?
        idx
      else
        File.basename(doc.path, File.extname(doc.path))
      end

    target = File.join(dir, "#{stem}.#{extension}")
    File.write(target, doc.to_s(tagged: tagged, **opts))
  end
end