class AnyStyle::Finder

Public Class Methods

new(options = {})
Calls superclass method
   # File lib/anystyle/finder.rb
# Builds a finder and registers the features observed for every line.
#
# options - Hash forwarded unchanged to the superclass initializer. Its
#           :dictionary entry, when truthy, is handed to Feature::Words
#           instead of Dictionary.instance.
def initialize(options = {})
  super(options)

  # Fall back to the shared dictionary unless the caller supplied one.
  dictionary = options[:dictionary] || Dictionary.instance

  @features = [
    Feature::Line.new,
    Feature::Category.new(strip: true),
    Feature::Words.new(dictionary: dictionary),
    Feature::Indent.new,
    Feature::Ref.new,
    Feature::Position.new(seq: :page, idx: :ln),
    Feature::Position.new(seq: :pages, idx: :pn)
  ]
end

Public Instance Methods

expand(dataset)
   # File lib/anystyle/finder.rb
# Stores on every line of every document the observations produced by
# each registered feature.
#
# Each feature's #observe receives the line's value plus keyword context:
# the page, the document's pages, the document itself (as :seq), the page
# and line numbers yielded by the document, the feature's index (:fn) and
# the line's running index within the document (:idx). The per-feature
# results are flattened into a single array.
#
# Returns whatever dataset.each returns.
def expand(dataset)
  dataset.each do |doc|
    doc.each.with_index do |(line, ln, page, pn), idx|
      observed = features.each_with_index.map do |feature, fn|
        feature.observe(line.value,
                        page: page, pages: doc.pages, seq: doc,
                        pn: pn, ln: ln, fn: fn, idx: idx)
      end

      # Features may return nested arrays; store one flat list per line.
      line.observations = observed.flatten
    end
  end
end
find(input, format: options[:format], **opts)
   # File lib/anystyle/finder.rb
# Labels the input and returns the result in the requested format.
#
# input  - Passed to #label unchanged.
# format - Output format; anything responding to #to_sym is accepted.
#          :references / :ref -> result of #format_references
#          :hash              -> result of #format_hash
#          :wapiti            -> the labelled dataset itself
#          Defaults to options[:format].
# opts   - Extra keyword options forwarded to #label and to the formatter.
#
# Raises ArgumentError when the format is not one of the above. This
# includes nil and other values that cannot be converted to a Symbol,
# which previously surfaced as NoMethodError from the #to_sym call.
def find(input, format: options[:format], **opts)
  # Only symbolize values that support it; everything else falls through
  # to the ArgumentError below.
  kind = format.respond_to?(:to_sym) ? format.to_sym : nil

  case kind
  when :references, :ref
    format_references(label(input, **opts), **opts)
  when :hash
    format_hash(label(input, **opts), **opts)
  when :wapiti
    label(input, **opts)
  else
    raise ArgumentError, "unknown format '#{format}'"
  end
end
format_hash(dataset, **opts)
   # File lib/anystyle/finder.rb
# Maps every document in the dataset to the result of its #to_h,
# forwarding opts as keyword arguments.
def format_hash(dataset, **opts)
  dataset.map do |document|
    document.to_h(**opts)
  end
end
format_references(dataset, **opts)
   # File lib/anystyle/finder.rb
# Maps every document in the dataset to the result of its #references,
# forwarding opts as keyword arguments.
def format_references(dataset, **opts)
  dataset.map do |document|
    document.references(**opts)
  end
end
label(input, layout: true, crop: false, **opts)
   # File lib/anystyle/finder.rb
# Prepares the input, runs the model over it and wraps the labelled
# documents in a new Wapiti::Dataset.
#
# input  - Passed to #prepare unchanged.
# layout - Forwarded to #prepare (default: true).
# crop   - Forwarded to #prepare (default: false).
# opts   - Extra keyword options forwarded to both #prepare and model.label.
def label(input, layout: true, crop: false, **opts)
  docs = prepare(input, layout: layout, crop: crop, **opts)
  labels = model.label(docs, **opts)

  # Pair each prepared document with the model output at the same index.
  labelled = docs.map.with_index do |doc, position|
    doc.label(labels[position])
  end

  Wapiti::Dataset.new(labelled)
end
prepare(input, layout: options[:layout], crop: false, pdftotext: options[:pdftotext], pdfinfo: options[:pdfinfo], **opts)
Calls superclass method
   # File lib/anystyle/finder.rb
# Opens the input as needed and hands the result to the superclass #prepare.
#
# input     - A String is opened with Document.open; an Array has each
#             element opened with Document.open and the results wrapped in
#             a Wapiti::Dataset; anything else is passed through untouched.
# layout    - Document option (default: options[:layout]).
# crop      - Document option (default: false).
# pdftotext - Document option (default: options[:pdftotext]).
# pdfinfo   - Document option (default: options[:pdfinfo]).
# opts      - Extra keyword options; given to Document.open together with
#             the options above, and on their own to the superclass method.
def prepare(input,
            layout: options[:layout],
            crop: false,
            pdftotext: options[:pdftotext],
            pdfinfo: options[:pdfinfo],
            **opts)
  doc_opts = { layout: layout, crop: crop, pdftotext: pdftotext, pdfinfo: pdfinfo, **opts }

  source =
    case input
    when String then Document.open(input, **doc_opts)
    when Array  then Wapiti::Dataset.new(input.map { |entry| Document.open(entry, **doc_opts) })
    else input
    end

  super(source, **opts)
end
save_each(dataset, dir: '.', tagged: false, **opts)
    # File lib/anystyle/finder.rb
 # Writes every document of the dataset to its own file inside dir.
 #
 # The file is named after the document's path (basename without its
 # extension) or, when the path is nil, after the document's index in the
 # dataset. The extension is 'ttx' when tagged is true and 'txt' otherwise.
 #
 # dataset - Collection of documents responding to #path and #to_s.
 # dir     - Target directory (default: '.').
 # tagged  - Selects the extension and is forwarded to doc.to_s.
 # opts    - Extra keyword options forwarded to doc.to_s.
 def save_each(dataset, dir: '.', tagged: false, **opts)
   extension = tagged ? 'ttx' : 'txt'

   dataset.each.with_index do |doc, idx|
     stem =
       if doc.path.nil?
         idx
       else
         File.basename(doc.path, File.extname(doc.path))
       end

     target = File.join(dir, "#{stem}.#{extension}")
     File.write(target, doc.to_s(tagged: tagged, **opts))
   end
 end