class AnyStyle::Document
Constants
- REFSECT
Attributes
info[RW]
lines[RW]
meta[RW]
pages[RW]
path[RW]
tokens[RW]
Public Class Methods
open(path, format: File.extname(path), tagged: false, **opts)
click to toggle source
# File lib/anystyle/document.rb, line 20
#
# Reads the file at +path+ and builds a document from its text.
#
# +format+ is an extension-style string (compared case-insensitively)
# that selects how the input text is obtained:
#
# '.pdf':: text comes from pdf_to_text; pdf_meta and pdf_info are only
#          called when opts[:parse_meta] / opts[:parse_info] are truthy.
# '.ttx':: the file is read as UTF-8 and parsed as tagged input.
# '.txt':: the file is read as UTF-8.
#
# The returned document has its path set to the absolute path and its
# meta / info set to the PDF results (nil when they were not collected).
#
# Raises ArgumentError when the path is tainted, when the file does not
# exist, or when +format+ is none of the formats listed above.
def open(path, format: File.extname(path), tagged: false, **opts)
  # String#tainted? no longer exists on Ruby 3.2+, so only consult it
  # on interpreters that still provide it.
  if path.respond_to?(:tainted?) && path.tainted?
    raise ArgumentError, "cannot open tainted path: '#{path}'"
  end
  raise ArgumentError, "document not found: '#{path}'" unless File.exist?(path)

  path = File.absolute_path(path)
  meta = nil
  info = nil

  case format.downcase
  when '.pdf'
    meta = pdf_meta(path, **opts) if opts[:parse_meta]
    info = pdf_info(path, **opts) if opts[:parse_info]
    input = pdf_to_text(path, **opts)
  when '.ttx'
    tagged = true
    input = File.read(path, encoding: 'utf-8')
  when '.txt'
    input = File.read(path, encoding: 'utf-8')
  else
    # Without this branch input stays nil and parse fails on nil.split.
    raise ArgumentError, "unsupported document format: '#{format}'"
  end

  doc = parse(input, tagged: tagged)
  doc.path = path
  doc.meta = meta
  doc.info = info
  doc
end
parse(string, delimiter: /\r?\n/, tagged: false)
click to toggle source
# File lib/anystyle/document.rb, line 9
#
# Splits +string+ on +delimiter+ and wraps every resulting line in a
# Wapiti::Token, then passes the token list to +new+.
#
# When +tagged+ is true each line is split once on the pattern
# /\s*\| / into a label part and a text part. A non-empty label becomes
# the current label; an empty label keeps the label of the preceding
# line. When +tagged+ is false every token gets an empty label.
def parse(string, delimiter: /\r?\n/, tagged: false)
  current_label = ''

  tokens = string.split(delimiter).map do |line|
    if tagged
      label, text = line.split(/\s*\| /, 2)

      if label.nil?
        # A blank line splits into an empty array; keep the running label
        # and an empty value instead of failing on nil.empty?.
        text = ''
      elsif !label.empty?
        current_label = label
      end

      line = text
    end

    Wapiti::Token.new(line, label: current_label.to_s)
  end

  new(tokens)
end
Public Instance Methods
each() { |line, ln, page, pn| ... }
click to toggle source
# File lib/anystyle/document.rb, line 65
#
# Walks every line of every page. The block receives the line, the
# line's zero-based index within its page, the page itself, and the
# page's zero-based index within +pages+.
#
# Returns self when a block is given, otherwise an enumerator.
def each
  return to_enum unless block_given?

  pages.each_with_index do |page, page_index|
    page.lines.each_with_index do |line, line_index|
      yield line, line_index, page, page_index
    end
  end

  self
end
each_section(skip: ['meta']) { |head, body| ... }
click to toggle source
# File lib/anystyle/document.rb, line 78
#
# Groups +lines+ into sections and yields each one as a [head, body]
# pair of token arrays.
#
# Lines labelled 'title' are collected into the head until a 'ref' or
# 'text' line has been seen; the next 'title' line after that yields
# the section collected so far and starts a new one. 'ref' and 'text'
# lines always go into the body. Lines with any other label go into the
# body unless their label is listed in +skip+.
#
# Returns self when a block is given, otherwise an enumerator that
# uses the same +skip+ list.
def each_section(skip: ['meta'])
  # Forward skip: so the enumerator form filters exactly like the block
  # form; a bare to_enum(:each_section) would fall back to the default.
  return to_enum(:each_section, skip: skip) unless block_given?

  head = []
  body = []
  seen_content = false

  lines.each do |ln|
    case ln.label
    when 'title'
      if seen_content
        yield [head, body]
        head, body, seen_content = [ln], [], false
      else
        head << ln
      end
    when 'ref', 'text'
      body << ln
      seen_content = true
    else
      body << ln unless skip.include?(ln.label)
    end
  end

  # NOTE(review): a trailing section without any title line is dropped
  # here, while an untitled section that is followed by a later title is
  # yielded above with an empty head - confirm this asymmetry is intended.
  yield [head, body] unless head.empty?

  self
end
include_references?(rc, tc)
click to toggle source
# File lib/anystyle/document.rb, line 169
#
# Decides from two counts whether a section qualifies: true when +rc+
# exceeds 10, or when +rc+ and +tc+ together exceed 20 and +rc+ is more
# than 0.2 times +tc+. The references method calls this with the number
# of 'ref' lines as +rc+ and the number of 'text' lines as +tc+.
def include_references?(rc, tc)
  return true if rc > 10

  enough_lines = (rc + tc) > 20
  enough_lines && (rc.to_f / tc) > 0.2
end
inspect()
click to toggle source
# File lib/anystyle/document.rb, line 194
#
# Short description of the document that shows only the value of
# +size+ inside braces.
def inspect
  '#<AnyStyle::Document lines={' + size.to_s + '}>'
end
label(other)
click to toggle source
# File lib/anystyle/document.rb, line 109
#
# Returns a duplicate of this document whose tokens are rebuilt from
# this document's lines: each new token keeps the line's value and takes
# its label (as a string), a copy of its observations, and its score
# from the element of +other+ at the same index.
#
# +other+ must respond to [] with an integer index and provide an
# element for every line.
def label(other)
  copy = dup

  copy.tokens = lines.each_with_index.map do |line, idx|
    source = other[idx]

    Wapiti::Token.new(
      line.value,
      label: source.label.to_s,
      observations: source.observations.dup,
      score: source.score
    )
  end

  copy
end
line_counts()
click to toggle source
# File lib/anystyle/document.rb, line 53
#
# Returns the hash stored in @line_counts, creating it on first use.
# The hash reports 0 for keys that have not been set. What is counted
# in it is decided by callers outside this method.
def line_counts
  return @line_counts if @line_counts

  @line_counts = Hash.new(0)
end
nnum_counts()
click to toggle source
# File lib/anystyle/document.rb, line 57
#
# Returns the hash stored in @nnum_counts, creating it on first use.
# The hash reports 0 for keys that have not been set. What is counted
# in it is decided by callers outside this method.
def nnum_counts
  return @nnum_counts if @nnum_counts

  @nnum_counts = Hash.new(0)
end
references(normalize_blocks: false, **opts)
click to toggle source
# File lib/anystyle/document.rb, line 147
#
# Returns the result of Refs.parse as an array.
#
# Without +normalize_blocks+ all lines are handed to Refs.parse at
# once. With +normalize_blocks+ the document is processed section by
# section (see each_section): sections without any 'ref' line are
# ignored, and the remaining ones are used only when one of their head
# lines matches REFSECT or include_references? accepts their 'ref' and
# 'text' line counts. Each accepted body is passed through
# Refs.normalize! (max_win_size 6 when the head matches REFSECT,
# otherwise 2) before being parsed.
#
# Extra keyword options are accepted and ignored.
def references(normalize_blocks: false, **opts)
  return Refs.parse(lines).to_a unless normalize_blocks

  each_section.each_with_object([]) do |(head, body), refs|
    ref_count = body.count { |tk| tk.label == 'ref' }
    next if ref_count == 0

    text_count = body.count { |tk| tk.label == 'text' }
    in_ref_section = head.any? { |tk| tk.value =~ REFSECT }

    # Skip sections with few ref lines!
    next unless in_ref_section || include_references?(ref_count, text_count)

    Refs.normalize! body, max_win_size: in_ref_section ? 6 : 2
    refs.concat Refs.parse(body).to_a
  end
end
sections(delimiter: "\n", spacer: ' ', **opts)
click to toggle source
# File lib/anystyle/document.rb, line 173
#
# Returns one hash per section produced by each_section, with the keys
# :title and :text. The title joins the head lines with +spacer+, the
# text joins the body lines with +delimiter+. Every line value is passed
# through display_chars and Unicode-normalized; head lines additionally
# lose their leading whitespace.
#
# Extra keyword options are accepted and ignored.
def sections(delimiter: "\n", spacer: ' ', **opts)
  each_section.map do |(head, body)|
    heading = head.map { |tk| display_chars(tk.value).lstrip.unicode_normalize }
    content = body.map { |tk| display_chars(tk.value).unicode_normalize }

    { title: heading.join(spacer), text: content.join(delimiter) }
  end
end
title(delimiter: " ", **opts)
click to toggle source
# File lib/anystyle/document.rb, line 186
#
# Returns the values of the first consecutive run of lines labelled
# 'title', joined with +delimiter+. Later 'title' lines that follow a
# differently labelled line are not included. Returns an empty string
# when no line is labelled 'title'.
#
# Extra keyword options are accepted and ignored.
def title(delimiter: " ", **opts)
  title_line = ->(ln) { ln.label == 'title' }

  first_run = lines
    .drop_while { |ln| !title_line.call(ln) }
    .take_while { |ln| title_line.call(ln) }

  first_run.map { |ln| ln.value }.join(delimiter)
end
to_a(encode: true, **opts)
click to toggle source
Calls superclass method
# File lib/anystyle/document.rb, line 133
#
# Delegates to the superclass implementation, passing +encode+ (which
# defaults to true here) together with any other keyword options.
def to_a(encode: true, **opts)
  forwarded = { encode: encode }.merge(opts)
  super(**forwarded)
end
to_h(**opts)
click to toggle source
# File lib/anystyle/document.rb, line 137
#
# Builds a hash describing the document with the keys :info, :meta,
# :sections, :title and :references. The keyword options are passed
# unchanged to sections, title and references.
def to_h(**opts)
  summary = { info: info, meta: meta }
  summary[:sections] = sections(**opts)
  summary[:title] = title(**opts)
  summary[:references] = references(**opts)
  summary
end
to_s(delimiter: "\n", encode: false, tagged: false, **opts)
click to toggle source
Calls superclass method
# File lib/anystyle/document.rb, line 120
#
# Renders the document as a string.
#
# With +tagged+ every line is written as its label, a space, "| " and
# the line's value, and the lines are joined with +delimiter+. The label
# text is left empty when it equals the label of the preceding line, and
# the label column is cut off after 14 characters.
#
# Without +tagged+ the call is delegated to the superclass with
# expanded: false and the remaining options.
def to_s(delimiter: "\n", encode: false, tagged: false, **opts)
  unless tagged
    return super(delimiter: delimiter, encode: encode, tagged: tagged, expanded: false, **opts)
  end

  prev_label = nil

  rows = lines.map do |ln|
    shown = ln.label == prev_label ? '' : ln.label
    prev_label = ln.label

    # NOTE(review): '%.14s' only truncates, it does not pad, so the '|'
    # column is not aligned - confirm whether fixed-width labels were meant.
    '%.14s| %s' % ["#{shown} ", ln.value]
  end

  rows.join(delimiter)
end