class PDF

a PDF parser

Attributes

hdr[RW]
off[RW]
str[RW]
trailer[RW]
xoff[RW]
xrefs[RW]

Public Class Methods

new(str=nil) click to toggle source
# File misc/pdfparse.rb, line 71
# Creates a parser. When +str+ is supplied it is handed straight to #read;
# with no argument the instance is left unpopulated.
def initialize(str=nil)
        return unless str
        read(str)
end
read(filename) click to toggle source

reads a filename as a PDF using VString

# File misc/pdfparse.rb, line 67
# Loads +filename+ through VString.read and returns a new instance built
# from the result.
def self.read(filename)
        contents = VString.read(filename)
        new(contents)
end

Public Instance Methods

deref(obj, depth=1) click to toggle source

dereference references from the specified root, with the specified depth

# File misc/pdfparse.rb, line 333
# Resolves +obj+ when it is a Ref, then (for depth > 1) recursively resolves
# the values of Hashes and the elements of Arrays, one level per unit of depth.
#
# obj::   any parsed object; only Ref, Hash and Array get special treatment
# depth:: how many levels to resolve; 1 resolves +obj+ itself only
#
# Resolved objects are cached in @xrefs[gen][id]. When nothing can be read at
# the target offset, the placeholder :poil is cached instead.
# Containers are dup'ed before their entries are replaced, so the cached
# objects are not modified by a deep dereference.
#
# Raises RuntimeError ('unknown ref off') when @xoff has no offset for the
# reference. The read cursor (@off) is left untouched in that case and is
# always restored after a lookup, even if parsing the target raises.
def deref(obj, depth=1)
        if obj.kind_of? Ref
                cache = (@xrefs[obj.gen] ||= {})
                unless resolved = cache[obj.id]
                        # look the offset up into a local first: assigning it
                        # straight to @off would clobber the cursor on failure
                        target = (@xoff[obj.gen] || {})[obj.id]
                        raise 'unknown ref off' unless target
                        saved = @off
                        begin
                                @off = target
                                puts "deref #{obj.gen} #{obj.id} => #{@off.to_s(16)}" if $DEBUG
                                resolved = cache[obj.id] = readany || :poil
                        ensure
                                @off = saved
                        end
                end
                obj = resolved
        end
        depth -= 1
        if depth > 0
                case obj
                when Hash
                        obj = obj.dup
                        obj.each { |k, v| obj[k] = deref(v, depth) }
                when Array
                        obj = obj.dup
                        obj.each_with_index { |v, i| obj[i] = deref(v, depth) }
                end
        end
        obj
end
each_page(h=@trailer['Root']['Pages']) { |ps_page(page_data(h))| ... } click to toggle source

iterates over the PDF pages, yields each PSPage

# File misc/pdfparse.rb, line 363
# Walks the page tree rooted at +h+ and yields a PSPage for every leaf.
#
# A node with a 'Kids' entry is treated as an intermediate node and each kid
# is visited recursively. Any other node is treated as a page: its 'Contents'
# entry is passed to #page_data and the result wrapped in PSPage.new.
#
# The block is captured explicitly and forwarded to the recursive calls.
# The former `&Proc.new` relied on Proc.new implicitly picking up the
# method's block, which raises ArgumentError on Ruby >= 3.0.
def each_page(h=@trailer['Root']['Pages'], &block)
        if h['Kids']
                h['Kids'].each { |k| each_page(k, &block) }
        else
                yield PSPage.new(page_data(h['Contents']))
        end
end
newstream(hash, data) click to toggle source
# File misc/pdfparse.rb, line 244
# Attaches the stream payload to its dictionary and returns the dictionary.
#
# hash:: the stream dictionary; its 'Filter' entry may be absent, a single
#        name or an array of names
# data:: the raw stream bytes
#
# A lone 'FlateDecode' filter is inflated with Zlib. Any other non-empty
# filter list is reported on stdout and the data is stored undecoded.
# The payload is stored under the :data key of +hash+ (modified in place).
def newstream(hash, data)
        filters = [hash['Filter']].flatten.compact
        unless filters.empty?
                if filters.length == 1 && filters.first == 'FlateDecode'
                        data = Zlib::Inflate.inflate(data)
                else
                        puts "stream filter #{filters.inspect} unsupported"
                end
        end
        hash[:data] = data
        hash
end
page(nr, ar=@trailer['Root']['Pages']['Kids']) click to toggle source

returns the nr-th page of the pdf as a PSPage

# File misc/pdfparse.rb, line 372
# Returns the nr-th page (counting from 1) found under the nodes in +ar+,
# as a PSPage, or nil when +nr+ is past the last page.
#
# nr:: page number, relative to the first page reachable from +ar+
# ar:: array of page-tree nodes; defaults to the kids of the root Pages node
#
# A kid with a 'Count' entry is treated as an intermediate node holding that
# many pages: it is either descended into or skipped as a whole. Any other
# kid counts as a single page.
#
# Previously an out-of-range +nr+ made the method return +ar+ itself (the
# value of Array#each) instead of a page; it now returns nil.
def page(nr, ar=@trailer['Root']['Pages']['Kids'])
        ar.each { |kid|
                if (count = kid['Count'])
                        return page(nr, kid['Kids']) if nr <= count
                        nr -= count
                else
                        nr -= 1
                        return PSPage.new(page_data(kid['Contents'])) if nr <= 0
                end
        }
        nil
end
page_data(ct) click to toggle source

returns the :data field for a Hash or the concatenation of the :data fields of the children for an Array

# File misc/pdfparse.rb, line 354
# Returns the content data for +ct+: its :data field, or, when +ct+ resolves
# to an Array, the concatenation of the :data fields of its elements.
#
# +ct+ and each array element are passed through #deref first and the
# *resolved* objects are used. The previous code used the #deref result only
# for the Array test and then indexed the original argument, which works only
# if a Ref forwards calls to its target.
# NOTE(review): Ref is defined elsewhere; whether it forwards calls is not
# visible here. Non-Ref values come back from #deref unchanged.
def page_data(ct)
        ct = deref(ct)
        if ct.kind_of? Array
                ct.map { |c| deref(c)[:data] }.join
        else
                ct[:data]
        end
end
read(str) click to toggle source

reads a string as a PDF and interprets its basic information (header, trailer, xref table)

# File misc/pdfparse.rb, line 76
# Parses +str+ as a PDF document and returns self.
#
# Stores the input in @str, reads the header from offset 0, then moves the
# cursor to the last occurrence of "trailer" and parses the trailer there.
# Raises RuntimeError ('bad pdf: no trailer') when that keyword is absent.
def read(str)
        @str = str
        @off = 0
        readhdr
        @off = @str.rindex("trailer", @str.length)
        raise 'bad pdf: no trailer' unless @off
        readtrailer
        self
end
readany() click to toggle source

reads & returns any PDF object according to its 1st char (almost); updates @xrefs if the object is indirect

# File misc/pdfparse.rb, line 276
def readany
        case @str[@off, 1]
        when nil; return
        when '/'; readname
        when '+', '-'; readint
        when '0'..'9'
                i = readint
                if ('0'..'9').include?(@str[@off, 1])
                        poff = @off
                        g = readint
                        case readcmd
                        when 'obj'
                                @xrefs[g] ||= {}
                                i = @xrefs[g][i] ||= readany
                                raise 'no endobj' if readcmd != 'endobj'
                        when 'R'
                                i = Ref.new(self, g, i)
                        else @off = poff
                        end
                end
                i
        when '['; readarray
        when '('; readstr
        when '<'
                if @str[@off+1, 1] == '<'
                        h = readhash
                        if @str[@off, 6] == 'stream' and i = @str.index("\n", @off) # readcmd may eat spaces that are part of the stream
                                l = h['Length'].to_i
                                h = newstream(h, @str[i+1, l])
                                @off = i+1+l
                                skipspc
                                raise 'no endstream' if readcmd != 'endstream'
                        end
                        h
                else readstr
                end
        else
                case c = readcmd
                when 'true', 'false', 'null'; c.to_sym
                when 'xref'; readxrtable ; (@trailer ||= {}).update readhash if readcmd == 'trailer' ; readint if readcmd == 'startxref' ; :xref
                else raise "unknown cmd #{c.inspect}"
                end
        end
end
readarray() click to toggle source
# File misc/pdfparse.rb, line 208
# Reads a "[ ... ]" array at @off and returns its elements as a Ruby Array.
# Returns nil, without moving the cursor, when @off is not on a '['.
# Elements are read with #readany until the closing ']' or the end of input;
# the cursor ends up after the bracket and any following whitespace.
def readarray
        return unless @str[@off, 1] == '['
        @off += 1
        skipspc
        items = []
        until @str[@off, 1] == ']' || @off >= @str.length
                items << readany
        end
        @off += 1
        skipspc
        items
end
readcmd() click to toggle source
# File misc/pdfparse.rb, line 231
# Reads a bare keyword at @off: every character up to the next whitespace,
# delimiter ( ) { } < > [ ] / % or the end of input. Skips the whitespace
# that follows and returns the keyword, which is empty when the cursor is
# already on a delimiter.
def readcmd
        word = ''
        while (ch = @str[@off, 1])
                break if /[\s\(\)\{\}<>\[\]\/%]/ === ch
                word << ch
                @off += 1
        end
        skipspc
        word
end
readhash() click to toggle source
# File misc/pdfparse.rb, line 219
# Reads a "<< ... >>" dictionary at @off and returns it as a Ruby Hash.
# Returns nil, without moving the cursor, when @off is not on "<<".
# Keys are read with #readname and values with #readany until the closing
# ">>" or the end of input. Entries whose value is :null are dropped.
# The cursor ends up after ">>" and any following whitespace.
def readhash
        return unless @str[@off, 2] == '<<'
        @off += 2
        skipspc
        dict = {}
        until @str[@off, 2] == '>>' || @off >= @str.length
                key = readname
                dict[key] = readany
        end
        dict.reject! { |_key, value| value == :null }
        @off += 2
        skipspc
        dict
end
readhdr() click to toggle source
# File misc/pdfparse.rb, line 85
# Reads the header line: everything from @off up to (not including) the next
# newline. Stores it in @hdr, leaves @off on that newline and returns the
# header.
#
# The slice length is now computed relative to @off. The old code passed the
# absolute newline index as the length, which was only right for @off == 0.
# Raises RuntimeError ('bad pdf: no header') when no newline follows @off;
# previously this surfaced as a TypeError with @off set to nil.
def readhdr
        eol = @str.index("\n", @off)
        raise 'bad pdf: no header' unless eol
        @hdr = @str[@off, eol - @off]
        @off = eol
        @hdr
end
readint() click to toggle source
# File misc/pdfparse.rb, line 126
# Reads a number at @off and returns it: a Float when the text contains a
# '.', an Integer otherwise. Returns nil, without moving the cursor, when no
# numeric character is found.
#
# Accepted text: an optional leading sign, digits, and at most one '.'.
# Whitespace after the number is skipped.
def readint
        digits = ''
        while (ch = @str[@off, 1])
                if ch == '+' || ch == '-'
                        # a sign is only valid as the very first character
                        break unless digits.empty?
                elsif ch == '.'
                        break if digits.include?('.')
                elsif !('0'..'9').cover?(ch)
                        break
                end
                digits << ch
                @off += 1
        end
        return if digits.empty?
        skipspc
        digits.include?('.') ? digits.to_f : digits.to_i
end
readname() click to toggle source
# File misc/pdfparse.rb, line 193
# Reads a "/Name" token at @off and returns the name without its slash.
# Returns nil, without moving the cursor, when @off is not on a '/'.
#
# The name ends at whitespace, at one of ( ) { } < > [ ] / or at the end of
# input. A '#' followed by two hex digits is replaced by the character with
# that code. Whitespace after the name is skipped.
def readname
        return unless @str[@off, 1] == '/'
        name = ''
        loop do
                @off += 1
                ch = @str[@off, 1]
                break if ch.nil? || /[\s\(\)\{\}<>\[\]\/]/ === ch
                if ch == '#'
                        name << @str[@off+1, 2].to_i(16)
                        @off += 2
                else
                        name << ch
                end
        end
        skipspc
        name
end
readstr() click to toggle source
# File misc/pdfparse.rb, line 143
# Reads a string object at @off and returns its decoded content, leaving the
# cursor after the closing delimiter and any following whitespace.
# Returns nil, without moving the cursor, when @off is on neither '(' nor '<'.
#
# Literal strings "( ... )":
# * balanced nested parentheses are kept as part of the content
# * backslash escapes: \n \r \t \b \f, one to three octal digits, and
#   backslash + end-of-line (line continuation, contributes nothing);
#   any other escaped character stands for itself
#
# Hex strings "< ... >":
# * whitespace between digits is ignored
# * an odd number of digits is padded with a trailing 0
#
# The \f escape and the line continuation were previously unhandled: they
# produced a literal 'f' and a literal newline respectively.
def readstr
        buf = ''
        case @str[@off, 1]
        when '('
                nest = 0
                loop do
                        @off += 1
                        case c = @str[@off, 1]
                        when '('; nest += 1 ; buf << c
                        when ')'; nest -= 1 ; break if nest < 0 ; buf << c
                        when '\\'
                                @off += 1
                                case c = @str[@off, 1]
                                when 'n'; buf << ?\n
                                when 'r'; buf << ?\r
                                when 't'; buf << ?\t
                                when 'b'; buf << ?\b
                                when 'f'; buf << ?\f
                                when "\n"
                                        # line continuation: drop backslash and newline
                                when "\r"
                                        # line continuation with CR or CRLF end-of-line
                                        @off += 1 if @str[@off+1, 1] == "\n"
                                when '0'..'7'
                                        # up to two more octal digits may follow
                                        if ('0'..'7').include?(cc = @str[@off+1, 1])
                                                @off += 1 ; c << cc
                                                if ('0'..'7').include?(cc = @str[@off+1, 1])
                                                        @off += 1 ; c << cc
                                                end
                                        end
                                        buf << c.to_i(8)
                                when nil; break
                                else buf << c
                                end
                        when nil; break
                        else buf << c
                        end
                end
        when '<'
                loop do
                        @off += 1
                        case c = @str[@off, 1]
                        when '0'..'9', 'a'..'f', 'A'..'F'; buf << c
                        when ' ', "\n", "\r", "\t"
                        else break
                        end
                end
                buf << '0' if buf.length % 2 == 1
                buf = [buf].pack('H*')
        else return
        end
        @off += 1
        skipspc
        buf
end
readtrailer() click to toggle source

reads the PDF trailer. XXX: the xref table referenced here may be the first of the file, so we suppose the last one is just before the 'trailer' command.

# File misc/pdfparse.rb, line 91
# Parses the trailer at @off and loads the cross-reference offsets.
#
# Expects @off to be on a "trailer" keyword (#read positions it on the last
# one in the input). Sets:
#   @trailer:: the trailer dictionary
#   @xroff::   the number read after the keyword that follows the dictionary
#              (presumably "startxref" - the keyword itself is not checked)
#   @xoff::    object offsets collected by #readxrtable
#   @xrefs::   reset to an empty object cache
#
# If the table found at @xroff ends before the trailer we started from, and
# is followed by its own "trailer", it is taken to be an earlier table. In
# that case the "xref" closest before our trailer is read as well, and the
# earlier trailer's dictionary is merged into @trailer, overriding entries
# with the same key.
def readtrailer
        # remember where our trailer keyword starts
        toff = @off
        # skip the "trailer" keyword, then read its dictionary
        readcmd
        @trailer = readhash
        # skip the next keyword and read the number after it
        readcmd
        @xroff = readint
        @xoff = {}    # [gen] => { id => off }
        @xrefs = {}   # [gen] => { id => obj }
        # jump to that offset, skip the keyword found there and read the table
        @off = @xroff
        readcmd
        readxrtable
        # readxrtable stops on a "trailer" keyword; remember where
        off2 = @off
        # note: the readcmd in this condition moves @off, so the order of the
        # three tests matters
        if @off < toff and readcmd == 'trailer' and off = @str.rindex('xref', toff)
                # read the table whose "xref" is the last one at or before our trailer
                @off = off
                readcmd
                readxrtable
                # go back to the earlier trailer and merge its dictionary
                @off = off2
                readcmd
                @trailer.update readhash
        end
end
readxrtable() click to toggle source
# File misc/pdfparse.rb, line 113
# Reads cross-reference subsections starting at @off until the cursor sits
# on the "trailer" keyword.
#
# Each subsection starts with two numbers (first object number and entry
# count) followed by that many 20-byte entries of the form
# "offset generation flag". For entries flagged 'n' the offset is recorded
# in @xoff[generation][object number]; other entries are only counted.
def readxrtable
        until @str[@off, 7] == 'trailer'
                objnr = readint
                objcnt = readint
                span = 20*objcnt
                @str[@off, span].scan(/(\d+) (\d+) (.)/) { |offset, gen, flag|
                        (@xoff[gen.to_i] ||= {})[objnr] = offset.to_i if flag == 'n'
                        objnr += 1
                }
                @off += span
                skipspc
        end
end
skipspc() click to toggle source
# File misc/pdfparse.rb, line 321
# Advances @off past whitespace (space, newline, carriage return, tab) and
# past '%' comments, which run to the end of their line. Stops on the first
# other character or at the end of input.
def skipspc
        while @off < @str.length
                ch = @str[@off, 1]
                if ch == '%'
                        # comment: move to the newline ending it (or to the end of input)
                        @off += 1 while @str[@off, 1] != "\n" && @off < @str.length
                elsif ch != ' ' && ch != "\n" && ch != "\r" && ch != "\t"
                        break
                end
                @off += 1
        end
end