class AnyStyle::Feature::Line
Public Instance Methods
classify(chars)
click to toggle source
# File lib/anystyle/feature/line.rb, line 27
#
# Labels a line of text with a coarse category. Leading whitespace is
# stripped first; the patterns are then tried in a fixed order and the
# first one that matches decides the label.
#
# @param chars [String] the text of the line
# @return [Symbol] one of :toc, :list, :title, :cap, :num, :copyright,
#   :http, or :none when nothing matches
def classify(chars)
  text = chars.lstrip

  # Four dots (optionally spaced) or an ellipsis run, or a letter followed
  # by a wide gap and trailing digits.
  return :toc if text =~ /\.\s*\.\s*\.\s*\.|……+/ || text =~ /\p{L}\s{5,}\d+$/

  # A leading number, optionally bracketed or dotted, followed by letters.
  return :list if text =~ /^[\[\(]?\d+\.?[\]\)]?\s+\p{L}+/

  # An uppercase letter, a dotted number sequence, then letters.
  return :title if text =~ /^(\p{Lu}\.?)\s*(\d+\.)+\s+\p{L}+/

  # Table/figure/equation/graph keywords (incl. "Tabelle", "Abbildung"),
  # optionally preceded by a single word.
  return :cap if text =~ /^(\w+\s)?(tab(le|elle|\.)|fig(ure|\.)|equation|graph|abb(ildung)?)/i

  # A bare number (optionally wrapped in dashes) or only the letters i/v/x.
  return :num if text =~ /^\p{Pd}?\d+\p{Pd}?$/ || text =~ /^[ivx]+$/i

  return :copyright if text =~ /copyright|©|rights reserved/i
  return :http if text =~ /https?:\/\//i

  :none
end
observe(token, page:, seq:, **opts)
click to toggle source
# File lib/anystyle/feature/line.rb, line 4
#
# Builds the feature list for a single line. The returned array holds, in
# order: letter count, line width, upper-case/letter ratio, letter/width
# ratio, whitespace/width ratio, punctuation/width ratio, width relative to
# the page width, the line category from #classify, and two page-ratio
# values derived from +seq.line_counts+ and +seq.nnum_counts+.
#
# The helpers display_chars, count, ratio and nnum are defined outside this
# block; their exact semantics are not visible here.
#
# @param token the line to observe (passed to display_chars)
# @param page an object responding to #width
# @param seq an object responding to #line_counts, #nnum_counts and #pages
# @param opts additional keyword options (unused in this method)
# @return [Array] the ten feature values described above
def observe(token, page:, seq:, **opts)
  chars = display_chars(token)

  letters     = count(chars, /\p{L}/)
  capitals    = count(chars, /\p{Lu}/)
  punctuation = count(chars, /[\p{Pd}:.,&\(\)"'”„’‚´«「『‘“`»」』]/)
  whitespace  = count(chars, /\s/)
  width       = chars.length
  page_count  = seq.pages.length

  [
    letters,
    width,
    ratio(capitals, letters),
    ratio(letters, width),
    ratio(whitespace, width),
    ratio(punctuation, width),
    ratio(width, page.width),
    classify(chars),
    page_ratio(seq.line_counts[chars], page_count),
    page_ratio(seq.nnum_counts[nnum(chars)], page_count)
  ]
end
page_ratio(a, b)
click to toggle source
# File lib/anystyle/feature/line.rb, line 48
#
# Reduces the quotient a / b to a small discrete value.
#
# @param a [#to_f] the numerator (nil is treated as 0.0 via #to_f)
# @param b [Numeric] the denominator
# @return [String, Integer] '=' when the quotient equals 1, '+' when it
#   exceeds 1, otherwise the quotient multiplied by ten and rounded
def page_ratio(a, b)
  quotient = a.to_f / b

  if quotient == 1
    '='
  elsif quotient > 1
    '+'
  else
    (quotient * 10).round
  end
end