class AnyStyle::Document

Constants

REFSECT

Attributes

info[RW]
lines[RW]
meta[RW]
pages[RW]
path[RW]
tokens[RW]

Public Class Methods

open(path, format: File.extname(path), tagged: false, **opts)
   # File lib/anystyle/document.rb
# Reads the file at +path+ and parses its text into a document.
#
# path   - location of the file to open.
# format - file extension (including the leading dot) that selects how the
#          file is read; defaults to the extension of +path+. Recognised
#          values are '.pdf', '.ttx' and '.txt' (case-insensitive).
# tagged - whether the text carries "label | value" prefixes; always
#          forced to true for '.ttx' input.
# opts   - forwarded to the PDF helpers (pdf_meta, pdf_info, pdf_to_text,
#          which are defined outside this listing); +:parse_meta+ and
#          +:parse_info+ switch the corresponding extraction on.
#
# Returns the parsed document with +path+, +meta+ and +info+ assigned.
# Raises ArgumentError if the path is tainted, the file does not exist, or
# the format is not one of the recognised extensions.
def open(path, format: File.extname(path), tagged: false, **opts)
  # The taint API was deprecated in Ruby 2.7 and removed in 3.2, so only
  # consult it on interpreters that still provide it.
  raise ArgumentError,
    "cannot open tainted path: '#{path}'" if path.respond_to?(:tainted?) && path.tainted?
  raise ArgumentError,
    "document not found: '#{path}'" unless File.exist?(path)

  path = File.absolute_path(path)
  meta = nil
  info = nil

  case format.downcase
  when '.pdf'
    meta = pdf_meta path, **opts if opts[:parse_meta]
    info = pdf_info path, **opts if opts[:parse_info]
    input = pdf_to_text path, **opts
  when '.ttx'
    tagged = true
    input = File.read(path, encoding: 'utf-8')
  when '.txt'
    input = File.read(path, encoding: 'utf-8')
  else
    # Without this branch +input+ stays nil and parsing fails later with an
    # unrelated NoMethodError.
    raise ArgumentError, "unsupported format: '#{format}'"
  end

  doc = parse input, tagged: tagged
  doc.path = path
  doc.meta = meta
  doc.info = info
  doc
end
parse(string, delimiter: /\r?\n/, tagged: false)
   # File lib/anystyle/document.rb
 # Builds a new document from +string+, one token per line.
 #
 # string    - the text to parse.
 # delimiter - pattern used to split +string+ into lines.
 # tagged    - when true, each line is split at the first "| " into a label
 #             and a value; a line with an empty label inherits the label
 #             of the closest preceding labelled line.
 #
 # Returns whatever +new+ returns for the resulting list of Wapiti tokens.
 def parse(string, delimiter: /\r?\n/, tagged: false)
   current_label = ''
   new(string.split(delimiter).map { |line|
     if tagged
       label, line = line.split(/\s*\| /, 2)
       # An entirely empty line splits into [], leaving both parts nil;
       # treat it as an unlabelled, empty continuation line.
       current_label = label unless label.nil? || label.empty?
       line ||= ''
     end
     Wapiti::Token.new line, label: current_label.to_s
   })
 end

Public Instance Methods

each() { |line, ln, page, pn| ... }
   # File lib/anystyle/document.rb
# Walks every line of every page in order.
#
# Yields the line, its index within the page, the page, and the page's
# index. Returns self when a block is given, otherwise an enumerator.
def each
  return to_enum unless block_given?

  pages.each.with_index do |page, page_no|
    page.lines.each.with_index { |line, line_no| yield line, line_no, page, page_no }
  end

  self
end
each_section(skip: ['meta']) { |head, body| ... }
    # File lib/anystyle/document.rb
 # Groups the document's lines into sections and yields each one as a
 # [head, body] pair, where +head+ holds the 'title' lines and +body+ the
 # lines that follow them.
 #
 # A new section starts at the first 'title' line seen after 'ref' or
 # 'text' content. Lines with any other label go into the body unless the
 # label is listed in +skip+.
 #
 # skip - labels whose lines are left out of the section body.
 #
 # Returns self when a block is given, otherwise an enumerator that uses
 # the same +skip+ list.
 def each_section(skip: ['meta'])
   # Forward +skip+ explicitly; a bare to_enum(:each_section) would fall
   # back to the default list and ignore the caller's choice.
   return to_enum(:each_section, skip: skip) unless block_given?

   head = []
   body = []
   seen_content = false

   lines.each do |ln|
     case ln.label
     when 'title'
       if seen_content
         yield [head, body]
         head, body, seen_content = [ln], [], false
       else
         head << ln
       end
     when 'ref', 'text'
       body << ln
       seen_content = true
     else
       body << ln unless skip.include?(ln.label)
     end
   end

   # Emit the trailing section. An untitled section with content is yielded
   # when a title follows it, so it must be yielded here as well instead of
   # being dropped just because the document ends.
   yield [head, body] if seen_content || !head.empty?

   self
 end
include_references?(rc, tc)
    # File lib/anystyle/document.rb
# Decides whether a section's references should be kept, given its number
# of 'ref' lines (+rc+) and 'text' lines (+tc+).
#
# True when there are more than 10 ref lines, or when the section has more
# than 20 ref and text lines combined and the ref-to-text ratio is above 0.2.
def include_references?(rc, tc)
  return true if rc > 10
  return false unless (rc + tc) > 20

  (rc.to_f / tc) > 0.2
end
inspect()
    # File lib/anystyle/document.rb
# Short, human-readable summary of the document showing the value of +size+.
def inspect
  count = size
  "#<AnyStyle::Document lines={#{count}}>"
end
label(other)
    # File lib/anystyle/document.rb
# Returns a copy of this document whose tokens keep this document's line
# values but take label, observations and score from the entry at the
# same index in +other+.
#
# other - an indexable collection of labelled tokens.
def label(other)
  dup.tap do |copy|
    copy.tokens = lines.each_with_index.map do |line, idx|
      source = other[idx]
      Wapiti::Token.new(
        line.value,
        label: source.label.to_s,
        # Duplicated so the copy does not share the array with +other+.
        observations: source.observations.dup,
        score: source.score
      )
    end
  end
end
line_counts()
   # File lib/anystyle/document.rb
# Lazily created hash of counters; unknown keys read as 0.
def line_counts
  @line_counts = Hash.new(0) unless @line_counts
  @line_counts
end
nnum_counts()
   # File lib/anystyle/document.rb
# Lazily created hash of counters; unknown keys read as 0.
def nnum_counts
  @nnum_counts = Hash.new(0) unless @nnum_counts
  @nnum_counts
end
references(normalize_blocks: false, **opts)
    # File lib/anystyle/document.rb
# Collects the references found in the document.
#
# normalize_blocks - when false, every line is handed to Refs.parse at
#                    once. When true, the document is processed section
#                    by section: sections without 'ref' lines are skipped,
#                    and a section is used only if its heading matches
#                    REFSECT or include_references? accepts its ref/text
#                    line counts. Each accepted body is passed through
#                    Refs.normalize! before parsing.
# opts             - accepted but not used by this method.
#
# Returns an Array.
def references(normalize_blocks: false, **opts)
  return Refs.parse(lines).to_a unless normalize_blocks

  each_section.each_with_object([]) do |(head, body), found|
    ref_count = body.count { |tk| tk.label == 'ref' }
    next if ref_count == 0

    text_count = body.count { |tk| tk.label == 'text' }
    ref_heading = !head.find { |tk| tk.value =~ REFSECT }.nil?

    # Skip sections with few ref lines!
    next unless ref_heading || include_references?(ref_count, text_count)

    # Sections under a reference heading get the wider window.
    Refs.normalize! body, max_win_size: ref_heading ? 6 : 2
    found.concat Refs.parse(body).to_a
  end
end
sections(delimiter: "\n", spacer: ' ', **opts)
    # File lib/anystyle/document.rb
# Renders every section as a hash with a +:title+ and a +:text+ string.
#
# delimiter - string placed between body lines.
# spacer    - string placed between heading lines.
# opts      - accepted but not used by this method.
#
# Every value goes through display_chars (defined outside this listing)
# and Unicode normalization; heading lines also lose leading whitespace.
def sections(delimiter: "\n", spacer: ' ', **opts)
  each_section.map do |head, body|
    heading = head.map { |tk| display_chars(tk.value).lstrip.unicode_normalize }
    content = body.map { |tk| display_chars(tk.value).unicode_normalize }

    { title: heading.join(spacer), text: content.join(delimiter) }
  end
end
title(delimiter: " ", **opts)
    # File lib/anystyle/document.rb
# Joins the values of the first run of consecutive 'title' lines.
#
# delimiter - string placed between the title lines.
# opts      - accepted but not used by this method.
#
# Returns an empty string when the document has no 'title' line.
def title(delimiter: " ", **opts)
  parts = []

  lines.each do |ln|
    if ln.label == 'title'
      parts << ln.value
    elsif !parts.empty?
      # The first title run has ended; later titles are ignored.
      break
    end
  end

  parts.join(delimiter)
end
to_a(encode: true, **opts)
Calls superclass method
    # File lib/anystyle/document.rb
# Delegates to the superclass implementation, with +encode+ defaulting to
# true here. All other options are passed through unchanged.
def to_a(encode: true, **opts)
  options = { encode: encode }.merge(opts)
  super(**options)
end
to_h(**opts)
    # File lib/anystyle/document.rb
# Summarises the document as a hash with the keys +:info+, +:meta+,
# +:sections+, +:title+ and +:references+.
#
# opts - forwarded unchanged to +sections+, +title+ and +references+.
def to_h(**opts)
  summary = { info: info, meta: meta }
  summary[:sections] = sections(**opts)
  summary[:title] = title(**opts)
  summary[:references] = references(**opts)
  summary
end
to_s(delimiter: "\n", encode: false, tagged: false, **opts)
Calls superclass method
    # File lib/anystyle/document.rb
# Serialises the document as text.
#
# delimiter - string placed between lines.
# encode    - passed on to the superclass implementation.
# tagged    - when true, each line is written as a 14-character label
#             column, then "| ", then the line's value; the label is left
#             blank when it repeats the previous line's label. When false,
#             the superclass implementation is used with expanded: false.
# opts      - passed on to the superclass implementation.
def to_s(delimiter: "\n", encode: false, tagged: false, **opts)
  unless tagged
    return super(delimiter: delimiter, encode: encode, tagged: tagged, expanded: false, **opts)
  end

  previous = nil
  rows = lines.map do |ln|
    shown = ln.label == previous ? '' : ln.label
    previous = ln.label
    # The padding plus the %.14s precision pads or truncates the label to
    # exactly 14 characters.
    '%.14s| %s' % ["#{shown}              ", ln.value]
  end

  rows.join(delimiter)
end